diff --git a/.clang-format b/.clang-format
index 43032d44e2..3ccecaac1f 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,3 +2,10 @@ BasedOnStyle: Google
 MaxEmptyLinesToKeep: 3
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
+DerivePointerAlignment: false
+PointerAlignment: Right
+# TODO(davidben): The default for Google style is now Regroup, but the default
+# IncludeCategories does not recognize <openssl/header.h>. We should
+# reconfigure IncludeCategories to match. For now, keep it at Preserve.
+IncludeBlocks: Preserve
+
diff --git a/.coveralls.yml b/.coveralls.yml
new file mode 100644
index 0000000000..cf27a37024
--- /dev/null
+++ b/.coveralls.yml
@@ -0,0 +1 @@
+service_name: travis-pro
diff --git a/.gitattributes b/.gitattributes
index bf4e88576e..0271bc950c 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,10 +1,7 @@
 * text=auto !eol
-*.sln eol=crlf
-*.vcxproj eol=crlf
-*.vcxproj.filters eol=crlf
-*.props eol=crlf
-*.bat eol=crlf
-*.rc eol=crlf
-*.pl linguist-language=Assembly
+crypto/**/*.pl linguist-language=Assembly
+crypto/perlasm/*.pl linguist-language=Perl
 *.bin binary
 *.der binary
+**/*.h linguist-language=C
+**/*.inl linguist-language=C
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000..a9261f441f
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,422 @@
+name: ci
+on:
+  pull_request:
+  push:
+jobs:
+  rustfmt:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+          components: rustfmt
+      - uses: actions/checkout@v2
+      - run: cargo fmt --all -- --check
+
+  clippy:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+          components: clippy
+
+      - uses: actions/checkout@v2
+
+      - run: cargo clippy --all-features --all-targets -- --deny warnings
+
+  audit:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+
+      - uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cargo/bin/cargo-audit
+            ~/.cargo/.crates.toml
+            ~/.cargo/.crates2.json
+          key: ${{ runner.os }}-v2-cargo-audit-0.13.1
+
+      - run: cargo install cargo-audit --vers "0.13.1"
+
+      - uses: actions/checkout@v2
+
+      - run: cargo generate-lockfile
+
+      - run: cargo audit --deny warnings
+
+  deny:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-18.04
+
+    steps:
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+
+      - uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cargo/bin/cargo-deny
+            ~/.cargo/.crates.toml
+            ~/.cargo/.crates2.json
+          key: ${{ runner.os }}-v2-cargo-deny-0.8.4
+
+      - run: cargo install cargo-deny --vers "0.8.4"
+
+      - uses: actions/checkout@v2
+
+      - run: cargo deny check
+
+  # Verify that documentation builds.
+  rustdoc:
+    # Don't run duplicate `push` jobs for the repo owner's PRs.
+ if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ubuntu-18.04 + + strategy: + matrix: + rust_channel: + - stable + - beta + - nightly + + include: + - target: x86_64-unknown-linux-gnu + + steps: + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - uses: actions/checkout@v2 + + - run: | + cargo doc --all-features + + package: + # Don't run duplicate `push` jobs for the repo owner's PRs. + if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: windows-latest + + steps: + - uses: actions/checkout@v2 + + - run: powershell -ExecutionPolicy Bypass ./mk/install-build-tools.ps1 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + profile: minimal + + - run: sh mk/package.sh + shell: bash + + test: + # Don't run duplicate `push` jobs for the repo owner's PRs. + if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ${{ matrix.host_os }} + + strategy: + matrix: + features: + - # Default + + target: + - aarch64-apple-ios + - aarch64-apple-darwin + - aarch64-linux-android + - aarch64-unknown-linux-gnu + - aarch64-unknown-linux-musl + - arm-unknown-linux-gnueabihf + - armv7-linux-androideabi + - armv7-unknown-linux-musleabihf + - i686-pc-windows-msvc + - i686-unknown-linux-gnu + - i686-unknown-linux-musl + - x86_64-pc-windows-gnu + - x86_64-pc-windows-msvc + - x86_64-apple-darwin + - x86_64-unknown-linux-musl + - x86_64-unknown-linux-gnu + + mode: + - # debug + - --release + + rust_channel: + - stable + - nightly + - 1.37.0 # MSRV + - beta + + exclude: + # The stable channel doesn't have aarch64-apple-darwin support yet. + - target: aarch64-apple-darwin + rust_channel: stable + + # The MSRV channel doesn't have aarch64-apple-darwin support yet. + - target: aarch64-apple-darwin + rust_channel: 1.37.0 + + # Only do MSRV testing on release builds. + - mode: # debug + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: aarch64-unknown-linux-musl + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: armv7-unknown-linux-musleabihf + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: i686-unknown-linux-musl + rust_channel: 1.37.0 + + # 1.37.0 doesn't support `-Clink-self-contained`. + - target: x86_64-unknown-linux-musl + rust_channel: 1.37.0 + + # https://github.com/rust-lang/rust/pull/67429 + - target: x86_64-pc-windows-gnu + rust_channel: 1.37.0 + + include: + - target: aarch64-apple-darwin + # macos-latest didn't work. + host_os: macos-11.0 + # GitHub Actions doesn't have a way to run this target yet. + cargo_options: --no-run + + - target: aarch64-apple-ios + host_os: macos-latest + # GitHub Actions doesn't have a way to run this target yet. 
+ cargo_options: --no-run + + - target: aarch64-linux-android + host_os: ubuntu-18.04 + # TODO: https://github.com/briansmith/ring/issues/486 + cargo_options: --no-run + + - target: aarch64-unknown-linux-gnu + host_os: ubuntu-18.04 + + - target: aarch64-unknown-linux-musl + host_os: ubuntu-18.04 + + - target: arm-unknown-linux-gnueabihf + host_os: ubuntu-18.04 + + - target: armv7-linux-androideabi + host_os: ubuntu-18.04 + # TODO: https://github.com/briansmith/ring/issues/838 + cargo_options: --no-run + + - target: armv7-unknown-linux-musleabihf + host_os: ubuntu-18.04 + # TODO: https://github.com/briansmith/ring/issues/1115 + cargo_options: --no-run + + - target: i686-pc-windows-msvc + host_os: windows-latest + + - target: i686-unknown-linux-gnu + host_os: ubuntu-18.04 + + - target: i686-unknown-linux-musl + host_os: ubuntu-18.04 + + - target: x86_64-pc-windows-gnu + host_os: windows-latest + + - target: x86_64-pc-windows-msvc + host_os: windows-latest + + - target: x86_64-apple-darwin + host_os: macos-latest + + - target: x86_64-unknown-linux-musl + host_os: ubuntu-18.04 + + - target: x86_64-unknown-linux-gnu + host_os: ubuntu-18.04 + + steps: + - if: ${{ contains(matrix.host_os, 'ubuntu') }} + run: sudo apt-get update -y + + - uses: actions/checkout@v2 + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: mk/install-build-tools.sh --target=${{ matrix.target }} ${{ matrix.features }} + + - if: ${{ contains(matrix.host_os, 'windows') }} + run: > + (powershell -ExecutionPolicy Bypass ./mk/install-build-tools.ps1) -and + ("$pwd\target\tools" >> $env:GITHUB_PATH) + + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - if: ${{ matrix.target == 'aarch64-apple-darwin' }} + run: echo "DEVELOPER_DIR=/Applications/Xcode_12.2.app/Contents/Developer" >> $GITHUB_ENV + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: | + mk/cargo.sh test -vv --target=${{ matrix.target }} ${{ matrix.cargo_options }} ${{ matrix.features }} ${{ matrix.mode }} + + - if: ${{ contains(matrix.host_os, 'windows') }} + run: | + cargo test -vv --target=${{ matrix.target }} ${{ matrix.cargo_options }} ${{ matrix.features }} ${{ matrix.mode }} + + # The wasm32-unknown-unknown targets have a different set of feature sets and + # an additional `webdriver` dimension. + test-wasm32: + # Don't run duplicate `push` jobs for the repo owner's PRs. + if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ${{ matrix.host_os }} + + strategy: + matrix: + features: + - # Default + - --features=wasm32_c + host_os: + - ubuntu-18.04 + mode: + - # debug + - --release + rust_channel: + - stable + - beta + - nightly + target: + - wasm32-unknown-unknown + webdriver: + - GECKODRIVER=geckodriver + - CHROMEDRIVER=chromedriver + + steps: + - if: ${{ contains(matrix.host_os, 'ubuntu') }} + run: sudo apt-get update -y + + - uses: actions/checkout@v2 + + - run: cargo generate-lockfile + + - run: mk/install-build-tools.sh --target=${{ matrix.target }} ${{ matrix.features }} + + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - run: | + ${{ matrix.webdriver }} mk/cargo.sh test -vv --target=${{ matrix.target }} ${{ matrix.features }} ${{ matrix.mode }} + + coverage: + # Don't run duplicate `push` jobs for the repo owner's PRs. 
+ if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ${{ matrix.host_os }} + + strategy: + matrix: + features: + - --all-features + + # TODO: targets + target: + - aarch64-unknown-linux-gnu + - i686-unknown-linux-gnu + - x86_64-unknown-linux-musl + + mode: + - # debug + + # Coverage collection is Nightly-only + rust_channel: + - nightly + + # TODO: targets + include: + # TODO: Use the -musl target after + # https://github.com/rust-lang/rust/issues/79556 and + # https://github.com/rust-lang/rust/issues/79555 are fixed. + - target: aarch64-unknown-linux-gnu + host_os: ubuntu-18.04 + + # TODO: Use the -musl target after + # https://github.com/rust-lang/rust/issues/79556 and + # https://github.com/rust-lang/rust/issues/79555 are fixed. + - target: i686-unknown-linux-gnu + host_os: ubuntu-18.04 + + - target: x86_64-unknown-linux-musl + host_os: ubuntu-18.04 + + # TODO: Add an ARM target after + # https://github.com/rust-lang/rust/issues/79555 is fixed. This may + # require https://github.com/rust-lang/rust/issues/79555 to be fixed + # too. + + steps: + - if: ${{ contains(matrix.host_os, 'ubuntu') }} + run: sudo apt-get update -y + + - uses: actions/checkout@v2 + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: RING_COVERAGE=1 mk/install-build-tools.sh --target=${{ matrix.target }} ${{ matrix.features }} + + - uses: actions-rs/toolchain@v1 + with: + override: true + target: ${{ matrix.target }} + toolchain: ${{ matrix.rust_channel }} + + - if: ${{ matrix.target == 'aarch64-apple-darwin' }} + run: echo "DEVELOPER_DIR=/Applications/Xcode_12.2.app/Contents/Developer" >> $GITHUB_ENV + + - if: ${{ !contains(matrix.host_os, 'windows') }} + run: | + RING_COVERAGE=1 mk/cargo.sh +${{ matrix.rust_channel }} test -vv --target=${{ matrix.target }} ${{ matrix.cargo_options }} ${{ matrix.features }} ${{ matrix.mode }} + + - uses: codecov/codecov-action@v1 + with: + directory: ./target/${{ matrix.target }}/debug/coverage/reports + fail_ci_if_error: true + verbose: true diff --git a/.gitignore b/.gitignore index 7219d6bd38..3b63d5972c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,29 +7,6 @@ ssl/test/runner/runner doc/*.html doc/doc.css -util/bot/android_ndk -util/bot/android_tools -util/bot/cmake-linux64 -util/bot/cmake-linux64.tar.gz -util/bot/cmake-mac -util/bot/cmake-mac.tar.gz -util/bot/cmake-win32 -util/bot/cmake-win32.zip -util/bot/golang -util/bot/gyp -util/bot/libcxx -util/bot/libcxxabi -util/bot/llvm-build -util/bot/nasm-win32.exe -util/bot/perl-win32 -util/bot/perl-win32.zip -util/bot/sde-linux64 -util/bot/sde-linux64.tar.bz2 -util/bot/sde-win32 -util/bot/sde-win32.tar.bz2 -util/bot/win_toolchain.json -util/bot/yasm-win32.exe - *.bk *.orig *~ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index dbbe8ac24a..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,683 +0,0 @@ -language: rust -cache: - directories: - - $HOME/kcov-i686-unknown-linux-gnu - - $HOME/kcov-x86_64-unknown-linux-gnu -matrix: - fast_finish: true - allow_failures: - - rust: nightly - include: - # The lines from "# BEGIN GENERATED" through "# END GENERATED" are - # generated by running |python mk/update-travis-yml.py|. Any changes - # made to those lines will be overwritten while other lines will be left - # untouched. 
- # - # BEGIN GENERATED - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 
RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=stable - rust: stable - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 
FEATURES_X= MODE_X=DEBUG KCOV=1 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - binutils-dev - - g++-7 - - gcc-7 - - libcurl4-openssl-dev - - libdw-dev - - libelf-dev - - libiberty-dev - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=1 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - g++-7 - - g++-7-multilib - - gcc-7 - - gcc-7-multilib - - libcurl3:i386 - - libcurl4-openssl-dev:i386 - - libdw-dev:i386 - - libelf-dev:i386 - - libiberty-dev:i386 - - libkrb5-dev:i386 - - libssl-dev:i386 - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=nightly - rust: nightly - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=x86_64-apple-darwin FEATURES_X= 
MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: osx - osx_image: xcode10.1 - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=aarch64-linux-android CC_X=aarch64-linux-android21-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-21 - - build-tools-26.0.2 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=armv7-linux-androideabi CC_X=armv7a-linux-androideabi18-clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18 - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=x86_64-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=aarch64-unknown-linux-gnu CC_X=aarch64-linux-gnu-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-aarch64-linux-gnu - - libc6-dev-arm64-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=clang FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: 
beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-multilib - - libc6-dev-i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=i686-unknown-linux-gnu CC_X=gcc-7 FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-7 - - gcc-7-multilib - - linux-libc-dev:i386 - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=DEBUG KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - - env: TARGET_X=arm-unknown-linux-gnueabihf CC_X=arm-linux-gnueabihf-gcc FEATURES_X= MODE_X=RELWITHDEBINFO KCOV=0 RUST_X=beta - rust: beta - os: linux - dist: trusty - addons: - apt: - packages: - - gcc-arm-linux-gnueabihf - - libc6-dev-armhf-cross - sources: - - ubuntu-toolchain-r-test - - # END GENERATED - -script: if [[ "$TARGET_X" =~ ^a*.*linux-.* && "$MODE_X" == "RELWITHDEBINFO" ]]; then travis_wait 60 mk/travis.sh; else mk/travis.sh; fi diff --git a/BUILDING.md b/BUILDING.md index ab6d89558c..e5f6b7e548 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -26,15 +26,16 @@ Builds directly from Git ------------------------ If you want to hack on *ring* then you need to build it directly from its Git -repository. In this case, you must also have Perl installed, because the -assembly language modules inherited from BoringSSL (inherited from OpenSSL) -use Perl as a macro assembly language. +repository. There are some additional requirements for doing this that do not +apply when building from crates.io: -When building from Git for Windows, directories containing yasm.exe and -perl.exe must be in `%PATH%`, where yasm.exe is -[Yasm](http://yasm.tortall.net/Download.html) 1.3 or later and where perl.exe -is recommended to be [Strawberry Perl](http://strawberryperl.com). +* For any target for which *ring* has assembly language implementations of + primitives (32- and 64- bit Intel, and 32- and 64-bit ARM), Perl must be + installed and in `$PATH`. +* For Windows targets, `target/tools/nasm[.exe]` is used as the assembler; + [mk/install-build-tools.ps1](mk/install-build-tools.ps1) downloads it for + Windows hosts. Cross Compiling --------------- @@ -79,11 +80,6 @@ e.g. export `CFLAGS=-D__ANDROID_API__=21`. Additional Features that are Useful for Development --------------------------------------------------- -The `internal_benches` feature enable benchmarks of internal functions. These -benchmarks are only useful for people hacking on the implementation of *ring*. -(The benchmarks for the *ring* API are in the -[crypto-bench](https://github.com/briansmith/crypto-bench) project.) - The `slow_tests` feature runs additional tests that are too slow to run during a normal edit-compile-test cycle. diff --git a/Cargo.toml b/Cargo.toml index 275d42c621..bef69f9932 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ license-file = "LICENSE" name = "ring" readme = "doc/link-to-readme.md" repository = "https://github.com/briansmith/ring" -version = "0.16.15" +version = "0.16.19" # Prevent multiple versions of *ring* from being linked into the same program. 
links = "ring-asm" @@ -72,6 +72,7 @@ include = [ "crypto/fipsmodule/modes/asm/ghash-x86.pl", "crypto/fipsmodule/modes/asm/ghash-x86_64.pl", "crypto/fipsmodule/modes/asm/ghashv8-armx.pl", + "crypto/fipsmodule/modes/gcm.c", "crypto/fipsmodule/modes/internal.h", "crypto/fipsmodule/rand/asm/rdrand-x86_64.pl", "crypto/fipsmodule/sha/asm/sha256-armv4.pl", @@ -88,10 +89,11 @@ include = [ "crypto/perlasm/x86gas.pl", "crypto/perlasm/x86nasm.pl", "crypto/perlasm/x86_64-xlate.pl", - "crypto/poly1305/asm/poly1305-armv4.pl", - "crypto/poly1305/asm/poly1305-armv8.pl", - "crypto/poly1305/asm/poly1305-x86.pl", - "crypto/poly1305/asm/poly1305-x86_64.pl", + "crypto/poly1305/internal.h", + "crypto/poly1305/poly1305.c", + "crypto/poly1305/poly1305_arm.c", + "crypto/poly1305/poly1305_arm_asm.S", + "crypto/poly1305/poly1305_vec.c", "doc/link-to-readme.md", "examples/checkdigest.rs", "include/GFp/aes.h", @@ -100,6 +102,7 @@ include = [ "include/GFp/check.h", "include/GFp/cpu.h", "include/GFp/mem.h", + "include/GFp/poly1305.h", "include/GFp/type_check.h", "src/aead.rs", "src/aead/aes.rs", @@ -299,14 +302,15 @@ name = "ring" [dependencies] untrusted = { version = "0.7.1" } -[target.'cfg(all(any(target_arch = "aarch64", target_arch = "arm", target_arch = "x86", target_arch = "x86_64"), not(target_os = "ios")))'.dependencies] +[target.'cfg(any(target_arch = "x86",target_arch = "x86_64", all(any(target_arch = "aarch64", target_arch = "arm"), any(target_os = "android", target_os = "fuchsia", target_os = "linux"))))'.dependencies] spin = { version = "0.5.2", default-features = false } [target.'cfg(any(target_os = "android", target_os = "linux"))'.dependencies] libc = { version = "0.2.69", default-features = false } +once_cell = { version = "1.5.2", default-features = false, features=["std"], optional = true } -[target.'cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux", target_os = "netbsd", target_os = "openbsd", target_os = "solaris"))'.dependencies] -once_cell = { version = "1.3.1", default-features = false, features=["std"], optional = true } +[target.'cfg(any(target_os = "dragonfly", target_os = "freebsd", target_os = "illumos", target_os = "netbsd", target_os = "openbsd", target_os = "solaris"))'.dependencies] +once_cell = { version = "1.5.2", default-features = false, features=["std"] } [target.'cfg(all(target_arch = "wasm32", target_vendor = "unknown", target_os = "unknown", target_env = ""))'.dependencies] web-sys = { version = "0.3.37", default-features = false, features = ["Crypto", "Window"] } @@ -315,14 +319,14 @@ web-sys = { version = "0.3.37", default-features = false, features = ["Crypto", winapi = { version = "0.3.8", default-features = false, features = ["ntsecapi", "wtypesbase"] } [target.'cfg(target_arch = "wasm32")'.dev-dependencies] -wasm-bindgen-test = { version = "0.3.10", default-features = false } +wasm-bindgen-test = { version = "0.3.18", default-features = false } [target.'cfg(any(unix, windows))'.dev-dependencies] -libc = { version = "0.2.69", default-features = false } +libc = { version = "0.2.80", default-features = false } # Keep this in sync with `[dependencies]` in pregenerate_asm/Cargo.toml. [build-dependencies] -cc = { version = "1.0.41", default-features = false } +cc = { version = "1.0.62", default-features = false } [features] # These features are documented in the top-level module's documentation. 
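[Editor's aside: the `once_cell` dependency rearranged above exists to support thread-safe, do-once initialization with the crate's `std` feature. The following is a minimal sketch of that pattern, not *ring*'s actual code; `detect_cpu_features` is a hypothetical stand-in for whatever expensive one-time work (CPU feature probing, opening an OS randomness source) such a crate needs.]

```rust
// Hedged sketch: the one-time-initialization pattern enabled by the
// `once_cell` dependency above. `detect_cpu_features` is hypothetical.
use once_cell::sync::Lazy;

static CPU_FEATURES: Lazy<u32> = Lazy::new(detect_cpu_features);

fn detect_cpu_features() -> u32 {
    // Placeholder: a real implementation would query CPUID or the OS.
    0
}

fn main() {
    // The first access runs `detect_cpu_features` exactly once, even if
    // many threads race to get here; every later access is just a load.
    println!("features: {:#x}", *CPU_FEATURES);
}
```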
diff --git a/README.md b/README.md index 50c8b77722..47ae07be03 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ Users of *ring* should always use the latest released version, and users should upgrade to the latest released version as soon as it is released. *ring* has a linear release model that favors users of the latest released version. We have never backported fixes to earlier releases and we don't -maintain branches other than the master branch. Further, for some obscure +maintain branches other than the main branch. Further, for some obscure technical reasons it's currently not possible to link two different versions of *ring* into the same program; for policy reasons we don't bother to try to work around that. Thus it is important that libraries using *ring* update @@ -169,8 +169,8 @@ source libraries use. The idea behind *our* model is to encourage all users to work together to ensure that the latest version is good *as it is being developed*. In particular, because users know that correctness/security fixes (if any) aren't going to get backported, they have a strong incentive to help -review pull requests before they are merged and/or review commits on the master -branch after they've landed to ensure that code quality on the master branch +review pull requests before they are merged and/or review commits on the main +branch after they've landed to ensure that code quality on the main branch stays high. The more common model, where there are stable versions that have important @@ -198,41 +198,31 @@ any security vulnerability in this code privately to anybody.** Online Automated Testing ------------------------ -Travis CI is used for Android, Linux, and macOS. Appveyor is used for Windows. -The tests are run in debug and release configurations, for the current release -of each Rust channel (Stable, Beta, Nightly), for each configuration listed in -the table below. The C compilers listed are used for compiling the C portions. - - - - - - - - - - - - - - - - - - - - - - - - -
-<tr><th>OS</th><th>Arch.</th><th>Compilers</th><th>Status</th></tr>
-<tr><td>Linux</td><td>x86, x86_64</td>
-    <td>GCC 4.8, GCC 7, Clang 5</td><td>Build Status</td></tr>
-<tr><td></td><td>32&#8209;bit ARM, AAarch64</td>
-    <td>GCC (Ubuntu/Linaro 4.8.4-2ubuntu1~14.04.1), tested using
-        qemu-user-arm.</td></tr>
-<tr><td>Android</td><td>ARMv7, Aarch64</td>
-    <td>*ring* for ARMv7 Android is built in CI using SDK version 26 targeting
-        API level 18 (Android 4.3+); it is tested in the emulator using the
-        corresponding system image. *ring* for AArch64 Android is built in CI
-        using SDK version 26 targeting API level 21 (Android 5.0).</td></tr>
-<tr><td>Mac OS X</td><td>x64</td>
-    <td>Apple LLVM version 9.0.0 (clang-900.0.39.2) from Xcode 9.3</td></tr>
-<tr><td>Windows</td><td>x86, x86_64</td><td>MSVC 2015 Update 3 (14.0)</td>
-    <td>Build Status</td></tr>
- - +The following targets are tested in GitHub Actions. The tests are run in debug +and release configurations, for the current release of each Rust channel +(Stable, Beta, Nightly). A C compiler is currently required to compile some +parts of *ring*; *ring* should be compatible with GCC 4.8+, Clang 10+, and MSVC +2019+, at least. + +| Target | Notes | +| -------------------------------| ----- | +| aarch64-apple-darwin | Build-only (GitHub Actions doesn't have a way to run the tests) +| aarch64-apple-ios | Build-only (GitHub Actions doesn't have a way to run the tests) +| aarch64-unknown-linux-gnu | Tested on 64-bit Linux using QEMU user emulation +| aarch64-unknown-linux-musl | Tested on 64-bit Linux using QEMU user emulation. [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| aarch64-linux-android | API level 21 (Android 5.0+); [Build-only; issue 486](https://github.com/briansmith/ring/issues/486) +| arm-unknown-linux-gnueabihf | Tested on 64-bit Linux using QEMU user emulation +| armv7-linux-androideabi | API level 18 (Android 4.3+); [Build-only; issue 838](https://github.com/briansmith/ring/issues/838) +| armv7-unknown-linux-musleabihf | Tested on 64-bit Linux using QEMU user emulation. [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| i686-pc-windows-msvc | Tested on 64-bit Windows Server 2019 Datacenter +| i686-unknown-linux-gnu | Tested on 64-bit Linux using multilib support +| i686-unknown-linux-musl | Tested on 64-bit Linux using multilib support. [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| x86_64-apple-darwin | +| x86_64-pc-windows-gnu | +| x86_64-pc-windows-msvc | Tested on 64-bit Windows Server 2019 Datacenter +| x86_64-unknown-linux-gnu | +| x86_64-unknown-linux-musl | [Needs more work; issue 713](https://github.com/briansmith/ring/issues/713) +| wasm32-unknown-unknown | Tested using wasm-bindgen-test-runner on Linux in Chrome and Firefox. License ------- diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 97988f1f17..0000000000 --- a/appveyor.yml +++ /dev/null @@ -1,20 +0,0 @@ -version: 1.0.{build} -os: - - Visual Studio 2019 -clone_depth: 1 -configuration: - - Debug - - Release -platform: - - Win32 - - x64 -environment: - matrix: - - TOOLCHAIN_VERSION: 14.0 - RUST: stable - - TOOLCHAIN_VERSION: 14.0 - RUST: beta - - TOOLCHAIN_VERSION: 14.0 - RUST: nightly - -build_script: mk/appveyor.bat diff --git a/build.rs b/build.rs index d0a7433e71..c09539d0a7 100644 --- a/build.rs +++ b/build.rs @@ -19,24 +19,6 @@ // another for the concrete logging implementation). Instead we use `eprintln!` // to log everything to stderr. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - // In the `pregenerate_asm_main()` case we don't want to access (Cargo) // environment variables at all, so avoid `use std::env` here. 
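[Editor's aside: the `RING_SRCS` table in the next hunk pairs each source file with the list of architectures it applies to, where an empty list means "built for every target". A minimal sketch of how such table-driven source selection works; `srcs_for` is a hypothetical helper for illustration, not part of ring's build.rs.]

```rust
// Hedged sketch of table-driven source selection: each entry pairs the
// architectures it applies to (empty slice = all) with a source path.
const SRCS: &[(&[&str], &str)] = &[
    (&[], "crypto/mem.c"),                           // built everywhere
    (&["x86_64"], "crypto/poly1305/poly1305_vec.c"), // x86_64 only
];

// Hypothetical helper: select the sources that apply to `arch`.
fn srcs_for<'a>(arch: &str, table: &'a [(&'a [&'a str], &'a str)]) -> Vec<&'a str> {
    table
        .iter()
        .filter(|(archs, _)| archs.is_empty() || archs.contains(&arch))
        .map(|&(_, path)| path)
        .collect()
}

fn main() {
    // The x86_64-only file is filtered out for an ARM build.
    assert_eq!(srcs_for("arm", SRCS), vec!["crypto/mem.c"]);
}
```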
@@ -52,13 +34,14 @@ const X86_64: &str = "x86_64"; const AARCH64: &str = "aarch64"; const ARM: &str = "arm"; -#[cfg_attr(rustfmt, rustfmt_skip)] +#[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/limbs/limbs.c"), (&[], "crypto/mem.c"), + (&[], "crypto/poly1305/poly1305.c"), (&[AARCH64, ARM, X86_64, X86], "crypto/crypto.c"), (&[AARCH64, ARM, X86_64, X86], "crypto/curve25519/curve25519.c"), @@ -75,7 +58,6 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[X86], "crypto/chacha/asm/chacha-x86.pl"), (&[X86], "crypto/fipsmodule/ec/asm/ecp_nistz256-x86.pl"), (&[X86], "crypto/fipsmodule/modes/asm/ghash-x86.pl"), - (&[X86], "crypto/poly1305/asm/poly1305-x86.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"), @@ -85,7 +67,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[X86_64], "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"), (&[X86_64], "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"), - (&[X86_64], "crypto/poly1305/asm/poly1305-x86_64.pl"), + (&[X86_64], "crypto/poly1305/poly1305_vec.c"), (&[X86_64], SHA512_X86_64), (&[X86_64], "crypto/fipsmodule/rand/asm/rdrand-x86_64.pl"), @@ -99,7 +81,8 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[ARM], "crypto/curve25519/asm/x25519-asm-arm.S"), (&[ARM], "crypto/fipsmodule/ec/asm/ecp_nistz256-armv4.pl"), (&[ARM], "crypto/fipsmodule/modes/asm/ghash-armv4.pl"), - (&[ARM], "crypto/poly1305/asm/poly1305-armv4.pl"), + (&[ARM], "crypto/poly1305/poly1305_arm.c"), + (&[ARM], "crypto/poly1305/poly1305_arm_asm.S"), (&[ARM], "crypto/fipsmodule/sha/asm/sha256-armv4.pl"), (&[ARM], "crypto/fipsmodule/sha/asm/sha512-armv4.pl"), @@ -108,7 +91,6 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[AARCH64], "crypto/chacha/asm/chacha-armv8.pl"), (&[AARCH64], "crypto/fipsmodule/ec/asm/ecp_nistz256-armv8.pl"), (&[AARCH64], "crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl"), - (&[AARCH64], "crypto/poly1305/asm/poly1305-armv8.pl"), (&[AARCH64], SHA512_ARMV8), ]; @@ -120,7 +102,7 @@ const SHA512_ARMV8: &str = "crypto/fipsmodule/sha/asm/sha512-armv8.pl"; const RING_TEST_SRCS: &[&str] = &[("crypto/constant_time_test.c")]; -#[cfg_attr(rustfmt, rustfmt_skip)] +#[rustfmt::skip] const RING_INCLUDES: &[&str] = &[ "crypto/curve25519/curve25519_tables.h", @@ -134,19 +116,20 @@ const RING_INCLUDES: &[&str] = "crypto/internal.h", "crypto/limbs/limbs.h", "crypto/limbs/limbs.inl", - "crypto/fipsmodule/modes/internal.h", + "crypto/poly1305/internal.h", "include/GFp/aes.h", "include/GFp/arm_arch.h", "include/GFp/base.h", "include/GFp/check.h", "include/GFp/cpu.h", "include/GFp/mem.h", + "include/GFp/poly1305.h", "include/GFp/type_check.h", "third_party/fiat/curve25519_32.h", "third_party/fiat/curve25519_64.h", ]; -#[cfg_attr(rustfmt, rustfmt_skip)] +#[rustfmt::skip] const RING_PERL_INCLUDES: &[&str] = &["crypto/perlasm/arm-xlate.pl", "crypto/perlasm/x86gas.pl", @@ -236,6 +219,7 @@ const ASM_TARGETS: &[(&str, Option<&str>, Option<&str>)] = &[ ("x86_64", Some(WINDOWS), Some("nasm")), ("x86_64", None, Some("elf")), ("aarch64", Some("ios"), Some("ios64")), + ("aarch64", Some("macos"), Some("ios64")), ("aarch64", None, Some("linux64")), ("x86", Some(WINDOWS), Some("win32n")), ("x86", Some("ios"), Some("macosx")), @@ -264,10 +248,6 @@ fn main() { fn ring_build_rs_main() { use std::env; - for (key, value) in env::vars() { - 
eprintln!("ENV {}={}", key, value);
-    }
-
     let out_dir = env::var("OUT_DIR").unwrap();
     let out_dir = PathBuf::from(out_dir);
 
@@ -324,9 +304,8 @@ fn pregenerate_asm_main() {
     if target_os == Some(WINDOWS) {
         let srcs = asm_srcs(perlasm_src_dsts);
         for src in srcs {
-            let src_path = PathBuf::from(src);
-            let obj_path = obj_path(&pregenerated, &src_path, MSVC_OBJ_EXT);
-            run_command(yasm(&src_path, target_arch, &obj_path));
+            let obj_path = obj_path(&pregenerated, &src, MSVC_OBJ_EXT);
+            run_command(nasm(&src, target_arch, &obj_path));
         }
     }
 }
@@ -374,7 +353,7 @@ fn build_c_code(target: &Target, pregenerated: PathBuf, out_dir: &Path) {
         .iter()
         .find(|entry| {
             let &(entry_arch, entry_os, _) = *entry;
-            entry_arch == &target.arch && is_none_or_equals(entry_os, &target.os)
+            entry_arch == target.arch && is_none_or_equals(entry_os, &target.os)
         })
         .unwrap();
@@ -405,7 +384,7 @@ fn build_c_code(target: &Target, pregenerated: PathBuf, out_dir: &Path) {
     // For Windows we also pregenerate the object files for non-Git builds so
     // the user doesn't need to install the assembler. On other platforms we
     // assume the C compiler also assembles.
-    if use_pregenerated && &target.os == WINDOWS {
+    if use_pregenerated && target.os == WINDOWS {
         // The pregenerated object files always use ".obj" as the extension,
         // even when the C/C++ compiler outputs files with the ".o" extension.
         asm_srcs = asm_srcs
@@ -462,8 +441,8 @@ fn build_library(
 ) {
     // Compile all the (dirty) source files into object files.
     let objs = additional_srcs
-        .into_iter()
-        .chain(srcs.into_iter())
+        .iter()
+        .chain(srcs.iter())
         .filter(|f| &target.env != "msvc" || f.extension().unwrap().to_str().unwrap() != "S")
         .map(|f| compile(f, target, warnings_are_errors, out_dir, includes_modified))
         .collect::<Vec<_>>();
@@ -487,7 +466,7 @@ fn build_library(
             let _ = c.flag("-Wl,-dead_strip");
         }
         _ => {
-            let _ = c.flag("-Wl,--gc-sections".into());
+            let _ = c.flag("-Wl,--gc-sections");
             enable_lvi_hardening(&mut c);
         }
     }
@@ -522,13 +501,13 @@ fn compile(
     if ext == "obj" {
         p.to_str().expect("Invalid path").into()
     } else {
-        let mut out_path = out_dir.clone().join(p.file_name().unwrap());
+        let mut out_path = out_dir.join(p.file_name().unwrap());
         assert!(out_path.set_extension(target.obj_ext));
         if need_run(&p, &out_path, includes_modified) {
-            let cmd = if &target.os != WINDOWS || ext != "asm" {
+            let cmd = if target.os != WINDOWS || ext != "asm" {
                 cc(p, ext, target, warnings_are_errors, &out_path)
             } else {
-                yasm(p, &target.arch, &out_path)
+                nasm(p, &target.arch, &out_path)
             };
 
             run_command(cmd);
@@ -538,7 +517,7 @@
 }
 
 fn obj_path(out_dir: &Path, src: &Path, obj_ext: &str) -> PathBuf {
-    let mut out_path = out_dir.clone().join(src.file_name().unwrap());
+    let mut out_path = out_dir.join(src.file_name().unwrap());
     assert!(out_path.set_extension(obj_ext));
     out_path
 }
@@ -550,6 +529,8 @@ fn cc(
     warnings_are_errors: bool,
     out_dir: &Path,
 ) -> Command {
+    let is_musl = target.env.starts_with("musl");
+
     let mut c = cc::Build::new();
     let _ = c.include("include");
     match ext {
@@ -564,9 +545,9 @@ fn cc(
     for f in cpp_flags(target) {
         let _ = c.flag(&f);
     }
-    if &target.os != "none"
-        && &target.os != "redox"
-        && &target.os != "windows"
+    if target.os != "none"
+        && target.os != "redox"
+        && target.os != "windows"
         && target.arch != "wasm32"
     {
         let _ = c.flag("-fstack-protector");
@@ -597,8 +578,19 @@ fn cc(
         }
     }
 
-    if (target.arch.as_str(), target.os.as_str()) == ("wasm32", "unknown") {
-        let _ = c.flag("--no-standard-libraries");
+    // Allow cross-compiling without a target sysroot for these targets.
+    //
+    // poly1305_vec.c requires <emmintrin.h> which requires <stdlib.h>.
+    if (target.arch == "wasm32" && target.os == "unknown")
+        || (target.os == "linux" && is_musl && target.arch != "x86_64")
+    {
+        if let Ok(compiler) = c.try_get_compiler() {
+            // TODO: Expand this to non-clang compilers in 0.17.0 if practical.
+            if compiler.is_like_clang() {
+                let _ = c.flag("-nostdlibinc");
+                let _ = c.define("GFp_NOSTDLIBINC", "1");
+            }
+        }
+    }
 
     if warnings_are_errors {
@@ -609,7 +601,7 @@ fn cc(
         };
         let _ = c.flag(flag);
     }
-    if is_musl {
+    if is_musl {
         // Some platforms enable _FORTIFY_SOURCE by default, but musl
         // libc doesn't support it yet. See
         // http://wiki.musl-libc.org/wiki/Future_Ideas#Fortify
@@ -630,21 +622,20 @@ fn cc(
     c
 }
 
-fn yasm(file: &Path, arch: &str, out_file: &Path) -> Command {
-    let (oformat, machine) = match arch {
-        "x86_64" => ("--oformat=win64", "--machine=amd64"),
-        "x86" => ("--oformat=win32", "--machine=x86"),
+fn nasm(file: &Path, arch: &str, out_file: &Path) -> Command {
+    let oformat = match arch {
+        "x86_64" => ("win64"),
+        "x86" => ("win32"),
         _ => panic!("unsupported arch: {}", arch),
     };
-    let mut c = Command::new("yasm.exe");
+    let mut c = Command::new("./target/tools/nasm");
     let _ = c
-        .arg("-X")
-        .arg("vc")
-        .arg("--dformat=cv8")
-        .arg(oformat)
-        .arg(machine)
         .arg("-o")
         .arg(out_file.to_str().expect("Invalid path"))
+        .arg("-f")
+        .arg(oformat)
+        .arg("-Xgnu")
+        .arg("-gcv8")
         .arg(file);
     c
}
@@ -783,7 +774,7 @@ fn file_modified(path: &Path) -> SystemTime {
 }
 
 fn get_command(var: &str, default: &str) -> String {
-    std::env::var(var).unwrap_or(default.into())
+    std::env::var(var).unwrap_or_else(|_| default.into())
 }
 
 fn check_all_files_tracked() {
@@ -828,11 +819,7 @@ where
 
 fn lvi_mitigation_not_supported(base_config: &cc::Build) -> bool {
     let feature_flag = base_config.is_flag_supported("-mlvi-hardening");
-    match feature_flag {
-        Ok(false) => true,
-        Err(_) => true,
-        _ => false,
-    }
+    matches!(feature_flag, Ok(false) | Err(_))
 }
 
 fn enable_lvi_hardening(c: &mut cc::Build) {
diff --git a/crypto/.gitattributes b/crypto/.gitattributes
deleted file mode 100644
index 15a5c58091..0000000000
--- a/crypto/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-*.h linguist-language=C
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl
index 1d6f60aa69..90fa2c777b 100755
--- a/crypto/chacha/asm/chacha-armv4.pl
+++ b/crypto/chacha/asm/chacha-armv4.pl
@@ -197,6 +197,8 @@ sub ROUND {
 .Lone:
 .long 1,0,0,0
 #if __ARM_MAX_ARCH__>=7
+.extern GFp_armcap_P
+.hidden GFp_armcap_P
 .LOPENSSL_armcap:
 .word GFp_armcap_P-.LChaCha20_ctr32
 #else
@@ -1151,7 +1153,6 @@ sub NEONROUND {
 	add	sp,sp,#4*(16+3)
 	ldmia	sp!,{r4-r11,pc}
 .size	ChaCha20_neon,.-ChaCha20_neon
-.comm	GFp_armcap_P,4,4
 #endif
 ___
 }}}
diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl
index 4644cfd1a8..80ec882c57 100755
--- a/crypto/chacha/asm/chacha-armv8.pl
+++ b/crypto/chacha/asm/chacha-armv8.pl
@@ -123,6 +123,7 @@ sub ROUND {
 #include <GFp/arm_arch.h>
 
 .extern GFp_armcap_P
+.hidden GFp_armcap_P
 
 .section .rodata
@@ -139,6 +140,7 @@ sub ROUND {
 .type	GFp_ChaCha20_ctr32,%function
 .align	5
 GFp_ChaCha20_ctr32:
+	AARCH64_VALID_CALL_TARGET
 	cbz	$len,.Labort
 #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
 	adrp	@x[0],:pg_hi21_nc:GFp_armcap_P
@@ -152,6 +154,7 @@ sub ROUND {
 	b.ne	ChaCha20_neon
 
 .Lshort:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-96]!
add x29,sp,#0 @@ -272,6 +275,7 @@ sub ROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER .Labort: ret @@ -328,6 +332,7 @@ sub ROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32 ___ @@ -373,6 +378,7 @@ sub NEONROUND { .type ChaCha20_neon,%function .align 5 ChaCha20_neon: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -572,6 +578,7 @@ sub NEONROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .Ltail_neon: @@ -681,6 +688,7 @@ sub NEONROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_neon,.-ChaCha20_neon ___ @@ -693,6 +701,7 @@ sub NEONROUND { .type ChaCha20_512_neon,%function .align 5 ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -1112,6 +1121,7 @@ sub NEONROUND { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_512_neon,.-ChaCha20_512_neon ___ diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index 98b4ae7106..c85bfa89cf 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -233,10 +233,6 @@ sub ROUND { # critical path is 24 cycles per round je .Lno_data mov GFp_ia32cap_P+4(%rip),%r10 ___ -$code.=<<___ if ($avx>2); - bt \$48,%r10 # check for AVX512F - jc .LChaCha20_avx512 -___ $code.=<<___; test \$`1<<(41-32)`,%r10d jnz .LChaCha20_ssse3 @@ -1807,733 +1803,7 @@ sub AVX2_lane_ROUND { } ######################################################################## -# AVX512 code paths -if ($avx>2) { -# This one handles shorter inputs... - -my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); -my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); - -sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round - &vpaddd ($a,$a,$b); - &vpxord ($d,$d,$a); - &vprold ($d,$d,16); - - &vpaddd ($c,$c,$d); - &vpxord ($b,$b,$c); - &vprold ($b,$b,12); - - &vpaddd ($a,$a,$b); - &vpxord ($d,$d,$a); - &vprold ($d,$d,8); - - &vpaddd ($c,$c,$d); - &vpxord ($b,$b,$c); - &vprold ($b,$b,7); -} - -my $xframe = $win64 ? 
32+8 : 8; - -$code.=<<___; -.type ChaCha20_avx512,\@function,5 -.align 32 -ChaCha20_avx512: -.LChaCha20_avx512: -.cfi_startproc - mov %rsp,%r9 # frame pointer -.cfi_def_cfa_register r9 - cmp \$512,$len - ja .LChaCha20_16x - - sub \$64+$xframe,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0x28(%r9) - movaps %xmm7,-0x18(%r9) -.Lavx512_body: -___ -$code.=<<___; - vbroadcasti32x4 .Lsigma(%rip),$a - vbroadcasti32x4 ($key),$b - vbroadcasti32x4 16($key),$c - vbroadcasti32x4 ($counter),$d - - vmovdqa32 $a,$a_ - vmovdqa32 $b,$b_ - vmovdqa32 $c,$c_ - vpaddd .Lzeroz(%rip),$d,$d - vmovdqa32 .Lfourz(%rip),$fourz - mov \$10,$counter # reuse $counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512 - -.align 16 -.Loop_outer_avx512: - vmovdqa32 $a_,$a - vmovdqa32 $b_,$b - vmovdqa32 $c_,$c - vpaddd $fourz,$d_,$d - mov \$10,$counter - vmovdqa32 $d,$d_ - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: -___ - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b00111001); - &vpshufd ($d,$d,0b10010011); - - &AVX512ROUND(); - &vpshufd ($c,$c,0b01001110); - &vpshufd ($b,$b,0b10010011); - &vpshufd ($d,$d,0b00111001); - - &dec ($counter); - &jnz (".Loop_avx512"); - -$code.=<<___; - vpaddd $a_,$a,$a - vpaddd $b_,$b,$b - vpaddd $c_,$c,$c - vpaddd $d_,$d,$d - - sub \$64,$len - jb .Ltail64_avx512 - - vpxor 0x00($inp),%x#$a,$t0 # xor with input - vpxor 0x10($inp),%x#$b,$t1 - vpxor 0x20($inp),%x#$c,$t2 - vpxor 0x30($inp),%x#$d,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$1,$a,$t0 - vextracti32x4 \$1,$b,$t1 - vextracti32x4 \$1,$c,$t2 - vextracti32x4 \$1,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$2,$a,$t0 - vextracti32x4 \$2,$b,$t1 - vextracti32x4 \$2,$c,$t2 - vextracti32x4 \$2,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jz .Ldone_avx512 - - vextracti32x4 \$3,$a,$t0 - vextracti32x4 \$3,$b,$t1 - vextracti32x4 \$3,$c,$t2 - vextracti32x4 \$3,$d,$t3 - - sub \$64,$len - jb .Ltail_avx512 - - vpxor 0x00($inp),$t0,$t0 # xor with input - vpxor 0x10($inp),$t1,$t1 - vpxor 0x20($inp),$t2,$t2 - vpxor 0x30($inp),$t3,$t3 - lea 0x40($inp),$inp # inp+=64 - - vmovdqu $t0,0x00($out) # write output - vmovdqu $t1,0x10($out) - vmovdqu $t2,0x20($out) - vmovdqu $t3,0x30($out) - lea 0x40($out),$out # out+=64 - - jnz .Loop_outer_avx512 - - jmp .Ldone_avx512 - -.align 16 -.Ltail64_avx512: - vmovdqa %x#$a,0x00(%rsp) - vmovdqa %x#$b,0x10(%rsp) - vmovdqa %x#$c,0x20(%rsp) - vmovdqa %x#$d,0x30(%rsp) - add \$64,$len - jmp .Loop_tail_avx512 - -.align 16 -.Ltail_avx512: - vmovdqa $t0,0x00(%rsp) - vmovdqa $t1,0x10(%rsp) - vmovdqa $t2,0x20(%rsp) - vmovdqa $t3,0x30(%rsp) - add \$64,$len - -.Loop_tail_avx512: - movzb ($inp,$counter),%eax - movzb (%rsp,$counter),%ecx - lea 1($counter),$counter - xor %ecx,%eax - mov 
%al,-1($out,$counter) - dec $len - jnz .Loop_tail_avx512 - - vmovdqa32 $a_,0x00(%rsp) - -.Ldone_avx512: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0x28(%r9),%xmm6 - movaps -0x18(%r9),%xmm7 -___ -$code.=<<___; - lea (%r9),%rsp -.cfi_def_cfa_register rsp -.Lavx512_epilogue: - ret -.cfi_endproc -.size ChaCha20_avx512,.-ChaCha20_avx512 -___ -} -if ($avx>2) { -# This one handles longer inputs... - -my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); -my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); -my @key=map("%zmm$_",(16..31)); -my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; - -sub AVX512_lane_ROUND { -my ($a0,$b0,$c0,$d0)=@_; -my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -my @x=map("\"$_\"",@xx); - - ( - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 - "&vpxord (@x[$d0],@x[$d0],@x[$a0])", - "&vpxord (@x[$d1],@x[$d1],@x[$a1])", - "&vpxord (@x[$d2],@x[$d2],@x[$a2])", - "&vpxord (@x[$d3],@x[$d3],@x[$a3])", - "&vprold (@x[$d0],@x[$d0],16)", - "&vprold (@x[$d1],@x[$d1],16)", - "&vprold (@x[$d2],@x[$d2],16)", - "&vprold (@x[$d3],@x[$d3],16)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxord (@x[$b0],@x[$b0],@x[$c0])", - "&vpxord (@x[$b1],@x[$b1],@x[$c1])", - "&vpxord (@x[$b2],@x[$b2],@x[$c2])", - "&vpxord (@x[$b3],@x[$b3],@x[$c3])", - "&vprold (@x[$b0],@x[$b0],12)", - "&vprold (@x[$b1],@x[$b1],12)", - "&vprold (@x[$b2],@x[$b2],12)", - "&vprold (@x[$b3],@x[$b3],12)", - - "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", - "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", - "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", - "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", - "&vpxord (@x[$d0],@x[$d0],@x[$a0])", - "&vpxord (@x[$d1],@x[$d1],@x[$a1])", - "&vpxord (@x[$d2],@x[$d2],@x[$a2])", - "&vpxord (@x[$d3],@x[$d3],@x[$a3])", - "&vprold (@x[$d0],@x[$d0],8)", - "&vprold (@x[$d1],@x[$d1],8)", - "&vprold (@x[$d2],@x[$d2],8)", - "&vprold (@x[$d3],@x[$d3],8)", - - "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", - "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", - "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", - "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", - "&vpxord (@x[$b0],@x[$b0],@x[$c0])", - "&vpxord (@x[$b1],@x[$b1],@x[$c1])", - "&vpxord (@x[$b2],@x[$b2],@x[$c2])", - "&vpxord (@x[$b3],@x[$b3],@x[$c3])", - "&vprold (@x[$b0],@x[$b0],7)", - "&vprold (@x[$b1],@x[$b1],7)", - "&vprold (@x[$b2],@x[$b2],7)", - "&vprold (@x[$b3],@x[$b3],7)" - ); -} - -my $xframe = $win64 ? 
0xa8 : 8; - -$code.=<<___; -.type ChaCha20_16x,\@function,5 -.align 32 -ChaCha20_16x: -.LChaCha20_16x: -.cfi_startproc - mov %rsp,%r9 # frame register -.cfi_def_cfa_register r9 - sub \$64+$xframe,%rsp - and \$-64,%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%r9) - movaps %xmm7,-0x98(%r9) - movaps %xmm8,-0x88(%r9) - movaps %xmm9,-0x78(%r9) - movaps %xmm10,-0x68(%r9) - movaps %xmm11,-0x58(%r9) - movaps %xmm12,-0x48(%r9) - movaps %xmm13,-0x38(%r9) - movaps %xmm14,-0x28(%r9) - movaps %xmm15,-0x18(%r9) -.L16x_body: -___ -$code.=<<___; - vzeroupper - - lea .Lsigma(%rip),%r10 - vbroadcasti32x4 (%r10),$xa3 # key[0] - vbroadcasti32x4 ($key),$xb3 # key[1] - vbroadcasti32x4 16($key),$xc3 # key[2] - vbroadcasti32x4 ($counter),$xd3 # key[3] - - vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... - vpshufd \$0x55,$xa3,$xa1 - vpshufd \$0xaa,$xa3,$xa2 - vpshufd \$0xff,$xa3,$xa3 - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - vpshufd \$0x00,$xb3,$xb0 - vpshufd \$0x55,$xb3,$xb1 - vpshufd \$0xaa,$xb3,$xb2 - vpshufd \$0xff,$xb3,$xb3 - vmovdqa64 $xb0,@key[4] - vmovdqa64 $xb1,@key[5] - vmovdqa64 $xb2,@key[6] - vmovdqa64 $xb3,@key[7] - - vpshufd \$0x00,$xc3,$xc0 - vpshufd \$0x55,$xc3,$xc1 - vpshufd \$0xaa,$xc3,$xc2 - vpshufd \$0xff,$xc3,$xc3 - vmovdqa64 $xc0,@key[8] - vmovdqa64 $xc1,@key[9] - vmovdqa64 $xc2,@key[10] - vmovdqa64 $xc3,@key[11] - - vpshufd \$0x00,$xd3,$xd0 - vpshufd \$0x55,$xd3,$xd1 - vpshufd \$0xaa,$xd3,$xd2 - vpshufd \$0xff,$xd3,$xd3 - vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet - vmovdqa64 $xd0,@key[12] - vmovdqa64 $xd1,@key[13] - vmovdqa64 $xd2,@key[14] - vmovdqa64 $xd3,@key[15] - - mov \$10,%eax - jmp .Loop16x - -.align 32 -.Loop_outer16x: - vpbroadcastd 0(%r10),$xa0 # reload key - vpbroadcastd 4(%r10),$xa1 - vpbroadcastd 8(%r10),$xa2 - vpbroadcastd 12(%r10),$xa3 - vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters - vmovdqa64 @key[4],$xb0 - vmovdqa64 @key[5],$xb1 - vmovdqa64 @key[6],$xb2 - vmovdqa64 @key[7],$xb3 - vmovdqa64 @key[8],$xc0 - vmovdqa64 @key[9],$xc1 - vmovdqa64 @key[10],$xc2 - vmovdqa64 @key[11],$xc3 - vmovdqa64 @key[12],$xd0 - vmovdqa64 @key[13],$xd1 - vmovdqa64 @key[14],$xd2 - vmovdqa64 @key[15],$xd3 - - vmovdqa64 $xa0,@key[0] - vmovdqa64 $xa1,@key[1] - vmovdqa64 $xa2,@key[2] - vmovdqa64 $xa3,@key[3] - - mov \$10,%eax - jmp .Loop16x - -.align 32 -.Loop16x: -___ - foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } - foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } -$code.=<<___; - dec %eax - jnz .Loop16x - - vpaddd @key[0],$xa0,$xa0 # accumulate key - vpaddd @key[1],$xa1,$xa1 - vpaddd @key[2],$xa2,$xa2 - vpaddd @key[3],$xa3,$xa3 - - vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data - vpunpckldq $xa3,$xa2,$xt3 - vpunpckhdq $xa1,$xa0,$xa0 - vpunpckhdq $xa3,$xa2,$xa2 - vpunpcklqdq $xt3,$xt2,$xa1 # "a0" - vpunpckhqdq $xt3,$xt2,$xt2 # "a1" - vpunpcklqdq $xa2,$xa0,$xa3 # "a2" - vpunpckhqdq $xa2,$xa0,$xa0 # "a3" -___ - ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); -$code.=<<___; - vpaddd @key[4],$xb0,$xb0 - vpaddd @key[5],$xb1,$xb1 - vpaddd @key[6],$xb2,$xb2 - vpaddd @key[7],$xb3,$xb3 - - vpunpckldq $xb1,$xb0,$xt2 - vpunpckldq $xb3,$xb2,$xt3 - vpunpckhdq $xb1,$xb0,$xb0 - vpunpckhdq $xb3,$xb2,$xb2 - vpunpcklqdq $xt3,$xt2,$xb1 # "b0" - vpunpckhqdq $xt3,$xt2,$xt2 # "b1" - vpunpcklqdq $xb2,$xb0,$xb3 # "b2" - vpunpckhqdq $xb2,$xb0,$xb0 # "b3" -___ - ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); -$code.=<<___; - vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further - vshufi32x4 
\$0xee,$xb0,$xa0,$xb0 - vshufi32x4 \$0x44,$xb1,$xa1,$xa0 - vshufi32x4 \$0xee,$xb1,$xa1,$xb1 - vshufi32x4 \$0x44,$xb2,$xa2,$xa1 - vshufi32x4 \$0xee,$xb2,$xa2,$xb2 - vshufi32x4 \$0x44,$xb3,$xa3,$xa2 - vshufi32x4 \$0xee,$xb3,$xa3,$xb3 -___ - ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); -$code.=<<___; - vpaddd @key[8],$xc0,$xc0 - vpaddd @key[9],$xc1,$xc1 - vpaddd @key[10],$xc2,$xc2 - vpaddd @key[11],$xc3,$xc3 - - vpunpckldq $xc1,$xc0,$xt2 - vpunpckldq $xc3,$xc2,$xt3 - vpunpckhdq $xc1,$xc0,$xc0 - vpunpckhdq $xc3,$xc2,$xc2 - vpunpcklqdq $xt3,$xt2,$xc1 # "c0" - vpunpckhqdq $xt3,$xt2,$xt2 # "c1" - vpunpcklqdq $xc2,$xc0,$xc3 # "c2" - vpunpckhqdq $xc2,$xc0,$xc0 # "c3" -___ - ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); -$code.=<<___; - vpaddd @key[12],$xd0,$xd0 - vpaddd @key[13],$xd1,$xd1 - vpaddd @key[14],$xd2,$xd2 - vpaddd @key[15],$xd3,$xd3 - - vpunpckldq $xd1,$xd0,$xt2 - vpunpckldq $xd3,$xd2,$xt3 - vpunpckhdq $xd1,$xd0,$xd0 - vpunpckhdq $xd3,$xd2,$xd2 - vpunpcklqdq $xt3,$xt2,$xd1 # "d0" - vpunpckhqdq $xt3,$xt2,$xt2 # "d1" - vpunpcklqdq $xd2,$xd0,$xd3 # "d2" - vpunpckhqdq $xd2,$xd0,$xd0 # "d3" -___ - ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); -$code.=<<___; - vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further - vshufi32x4 \$0xee,$xd0,$xc0,$xd0 - vshufi32x4 \$0x44,$xd1,$xc1,$xc0 - vshufi32x4 \$0xee,$xd1,$xc1,$xd1 - vshufi32x4 \$0x44,$xd2,$xc2,$xc1 - vshufi32x4 \$0xee,$xd2,$xc2,$xd2 - vshufi32x4 \$0x44,$xd3,$xc3,$xc2 - vshufi32x4 \$0xee,$xd3,$xc3,$xd3 -___ - ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); -$code.=<<___; - vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further - vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 - vshufi32x4 \$0x88,$xd0,$xb0,$xc0 - vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 - vshufi32x4 \$0x88,$xc1,$xa1,$xt1 - vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 - vshufi32x4 \$0x88,$xd1,$xb1,$xc1 - vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 - vshufi32x4 \$0x88,$xc2,$xa2,$xt2 - vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 - vshufi32x4 \$0x88,$xd2,$xb2,$xc2 - vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 - vshufi32x4 \$0x88,$xc3,$xa3,$xt3 - vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 - vshufi32x4 \$0x88,$xd3,$xb3,$xc3 - vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 -___ - ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= - ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); - - ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, - $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = - ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, - $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); -$code.=<<___; - cmp \$64*16,$len - jb .Ltail16x - - vpxord 0x00($inp),$xa0,$xa0 # xor with input - vpxord 0x40($inp),$xb0,$xb0 - vpxord 0x80($inp),$xc0,$xc0 - vpxord 0xc0($inp),$xd0,$xd0 - vmovdqu32 $xa0,0x00($out) - vmovdqu32 $xb0,0x40($out) - vmovdqu32 $xc0,0x80($out) - vmovdqu32 $xd0,0xc0($out) - - vpxord 0x100($inp),$xa1,$xa1 - vpxord 0x140($inp),$xb1,$xb1 - vpxord 0x180($inp),$xc1,$xc1 - vpxord 0x1c0($inp),$xd1,$xd1 - vmovdqu32 $xa1,0x100($out) - vmovdqu32 $xb1,0x140($out) - vmovdqu32 $xc1,0x180($out) - vmovdqu32 $xd1,0x1c0($out) - - vpxord 0x200($inp),$xa2,$xa2 - vpxord 0x240($inp),$xb2,$xb2 - vpxord 0x280($inp),$xc2,$xc2 - vpxord 0x2c0($inp),$xd2,$xd2 - vmovdqu32 $xa2,0x200($out) - vmovdqu32 $xb2,0x240($out) - vmovdqu32 $xc2,0x280($out) - vmovdqu32 $xd2,0x2c0($out) - - vpxord 0x300($inp),$xa3,$xa3 - vpxord 0x340($inp),$xb3,$xb3 - vpxord 0x380($inp),$xc3,$xc3 - vpxord 0x3c0($inp),$xd3,$xd3 - lea 0x400($inp),$inp - vmovdqu32 $xa3,0x300($out) - vmovdqu32 $xb3,0x340($out) - vmovdqu32 $xc3,0x380($out) - vmovdqu32 $xd3,0x3c0($out) - lea 0x400($out),$out - - sub \$64*16,$len - jnz 
.Loop_outer16x - - jmp .Ldone16x - -.align 32 -.Ltail16x: - xor %r10,%r10 - sub $inp,$out - cmp \$64*1,$len - jb .Less_than_64_16x - vpxord ($inp),$xa0,$xa0 # xor with input - vmovdqu32 $xa0,($out,$inp) - je .Ldone16x - vmovdqa32 $xb0,$xa0 - lea 64($inp),$inp - - cmp \$64*2,$len - jb .Less_than_64_16x - vpxord ($inp),$xb0,$xb0 - vmovdqu32 $xb0,($out,$inp) - je .Ldone16x - vmovdqa32 $xc0,$xa0 - lea 64($inp),$inp - - cmp \$64*3,$len - jb .Less_than_64_16x - vpxord ($inp),$xc0,$xc0 - vmovdqu32 $xc0,($out,$inp) - je .Ldone16x - vmovdqa32 $xd0,$xa0 - lea 64($inp),$inp - - cmp \$64*4,$len - jb .Less_than_64_16x - vpxord ($inp),$xd0,$xd0 - vmovdqu32 $xd0,($out,$inp) - je .Ldone16x - vmovdqa32 $xa1,$xa0 - lea 64($inp),$inp - - cmp \$64*5,$len - jb .Less_than_64_16x - vpxord ($inp),$xa1,$xa1 - vmovdqu32 $xa1,($out,$inp) - je .Ldone16x - vmovdqa32 $xb1,$xa0 - lea 64($inp),$inp - - cmp \$64*6,$len - jb .Less_than_64_16x - vpxord ($inp),$xb1,$xb1 - vmovdqu32 $xb1,($out,$inp) - je .Ldone16x - vmovdqa32 $xc1,$xa0 - lea 64($inp),$inp - - cmp \$64*7,$len - jb .Less_than_64_16x - vpxord ($inp),$xc1,$xc1 - vmovdqu32 $xc1,($out,$inp) - je .Ldone16x - vmovdqa32 $xd1,$xa0 - lea 64($inp),$inp - - cmp \$64*8,$len - jb .Less_than_64_16x - vpxord ($inp),$xd1,$xd1 - vmovdqu32 $xd1,($out,$inp) - je .Ldone16x - vmovdqa32 $xa2,$xa0 - lea 64($inp),$inp - - cmp \$64*9,$len - jb .Less_than_64_16x - vpxord ($inp),$xa2,$xa2 - vmovdqu32 $xa2,($out,$inp) - je .Ldone16x - vmovdqa32 $xb2,$xa0 - lea 64($inp),$inp - - cmp \$64*10,$len - jb .Less_than_64_16x - vpxord ($inp),$xb2,$xb2 - vmovdqu32 $xb2,($out,$inp) - je .Ldone16x - vmovdqa32 $xc2,$xa0 - lea 64($inp),$inp - - cmp \$64*11,$len - jb .Less_than_64_16x - vpxord ($inp),$xc2,$xc2 - vmovdqu32 $xc2,($out,$inp) - je .Ldone16x - vmovdqa32 $xd2,$xa0 - lea 64($inp),$inp - - cmp \$64*12,$len - jb .Less_than_64_16x - vpxord ($inp),$xd2,$xd2 - vmovdqu32 $xd2,($out,$inp) - je .Ldone16x - vmovdqa32 $xa3,$xa0 - lea 64($inp),$inp - - cmp \$64*13,$len - jb .Less_than_64_16x - vpxord ($inp),$xa3,$xa3 - vmovdqu32 $xa3,($out,$inp) - je .Ldone16x - vmovdqa32 $xb3,$xa0 - lea 64($inp),$inp - - cmp \$64*14,$len - jb .Less_than_64_16x - vpxord ($inp),$xb3,$xb3 - vmovdqu32 $xb3,($out,$inp) - je .Ldone16x - vmovdqa32 $xc3,$xa0 - lea 64($inp),$inp - - cmp \$64*15,$len - jb .Less_than_64_16x - vpxord ($inp),$xc3,$xc3 - vmovdqu32 $xc3,($out,$inp) - je .Ldone16x - vmovdqa32 $xd3,$xa0 - lea 64($inp),$inp - -.Less_than_64_16x: - vmovdqa32 $xa0,0x00(%rsp) - lea ($out,$inp),$out - and \$63,$len - -.Loop_tail16x: - movzb ($inp,%r10),%eax - movzb (%rsp,%r10),%ecx - lea 1(%r10),%r10 - xor %ecx,%eax - mov %al,-1($out,%r10) - dec $len - jnz .Loop_tail16x - - vpxord $xa0,$xa0,$xa0 - vmovdqa32 $xa0,0(%rsp) - -.Ldone16x: - vzeroall -___ -$code.=<<___ if ($win64); - movaps -0xa8(%r9),%xmm6 - movaps -0x98(%r9),%xmm7 - movaps -0x88(%r9),%xmm8 - movaps -0x78(%r9),%xmm9 - movaps -0x68(%r9),%xmm10 - movaps -0x58(%r9),%xmm11 - movaps -0x48(%r9),%xmm12 - movaps -0x38(%r9),%xmm13 - movaps -0x28(%r9),%xmm14 - movaps -0x18(%r9),%xmm15 -___ -$code.=<<___; - lea (%r9),%rsp -.cfi_def_cfa_register rsp -.L16x_epilogue: - ret -.cfi_endproc -.size ChaCha20_16x,.-ChaCha20_16x -___ -} +# AVX512 code paths were removed # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) @@ -2729,15 +1999,6 @@ sub AVX512_lane_ROUND { .rva .LSEH_end_ChaCha20_8x .rva .LSEH_info_ChaCha20_8x ___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_ChaCha20_avx512 - .rva .LSEH_end_ChaCha20_avx512 - 
.rva .LSEH_info_ChaCha20_avx512 - - .rva .LSEH_begin_ChaCha20_16x - .rva .LSEH_end_ChaCha20_16x - .rva .LSEH_info_ChaCha20_16x -___ $code.=<<___; .section .xdata .align 8 @@ -2761,17 +2022,6 @@ sub AVX512_lane_ROUND { .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] ___ -$code.=<<___ if ($avx>2); -.LSEH_info_ChaCha20_avx512: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] - -.LSEH_info_ChaCha20_16x: - .byte 9,0,0,0 - .rva full_handler - .rva .L16x_body,.L16x_epilogue # HandlerData[] -___ } foreach (split("\n",$code)) { diff --git a/crypto/crypto.c b/crypto/crypto.c index 06000a856a..8a3d06675b 100644 --- a/crypto/crypto.c +++ b/crypto/crypto.c @@ -35,10 +35,4 @@ // initialising it to zero, it becomes a "data symbol", which isn't so // affected. HIDDEN uint32_t GFp_ia32cap_P[4] = {0}; -#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) - -#include - -HIDDEN uint32_t GFp_armcap_P = 0; - #endif diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c index b4198996e8..30afff0eda 100644 --- a/crypto/curve25519/curve25519.c +++ b/crypto/curve25519/curve25519.c @@ -159,7 +159,7 @@ static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { static void fe_frombytes(fe *h, const uint8_t s[32]) { uint8_t s_copy[32]; - bytes_copy(s_copy, s, 32); + GFp_memcpy(s_copy, s, 32); s_copy[31] &= 0x7f; fe_frombytes_strict(h, s_copy); } @@ -171,21 +171,21 @@ static void fe_tobytes(uint8_t s[32], const fe *f) { // h = 0 static void fe_0(fe *h) { - fe_limbs_zero(h->v); + GFp_memset(h, 0, sizeof(fe)); } static void fe_loose_0(fe_loose *h) { - fe_limbs_zero(h->v); + GFp_memset(h, 0, sizeof(fe_loose)); } // h = 1 static void fe_1(fe *h) { - fe_0(h); + GFp_memset(h, 0, sizeof(fe)); h->v[0] = 1; } static void fe_loose_1(fe_loose *h) { - fe_loose_0(h); + GFp_memset(h, 0, sizeof(fe_loose)); h->v[0] = 1; } @@ -1782,7 +1782,7 @@ void GFp_x25519_scalar_mult_generic_masked(uint8_t out[32], fe_loose x2l, z2l, x3l, tmp0l, tmp1l; uint8_t e[32]; - bytes_copy(e, scalar_masked, 32); + GFp_memcpy(e, scalar_masked, 32); // The following implementation was transcribed to Coq and proven to // correspond to unary scalar multiplication in affine coordinates given that // x1 != 0 is the x coordinate of some point on the curve. It was also checked @@ -1856,7 +1856,7 @@ void GFp_x25519_scalar_mult_generic_masked(uint8_t out[32], void GFp_x25519_public_from_private_generic_masked(uint8_t out_public_value[32], const uint8_t private_key_masked[32]) { uint8_t e[32]; - bytes_copy(e, private_key_masked, 32); + GFp_memcpy(e, private_key_masked, 32); ge_p3 A; GFp_x25519_ge_scalarmult_base(&A, e); diff --git a/crypto/curve25519/internal.h b/crypto/curve25519/internal.h index 5f87f92003..60f2f615b4 100644 --- a/crypto/curve25519/internal.h +++ b/crypto/curve25519/internal.h @@ -65,12 +65,6 @@ static inline void fe_limbs_copy(fe_limb_t r[], const fe_limb_t a[]) { } } -static inline void fe_limbs_zero(fe_limb_t r[]) { - for (size_t i = 0; i < FE_NUM_LIMBS; ++i) { - r[i] = 0; - } -} - // ge means group element. 
// // Here the group is the set of pairs (x,y) of field elements (see fe.h) diff --git a/crypto/fipsmodule/.gitattributes b/crypto/fipsmodule/.gitattributes deleted file mode 100644 index 80928d6041..0000000000 --- a/crypto/fipsmodule/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.inl linguist-language=C diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c index 4284ea7ec6..19b019e73f 100644 --- a/crypto/fipsmodule/aes/aes_nohw.c +++ b/crypto/fipsmodule/aes/aes_nohw.c @@ -14,13 +14,6 @@ #include -#if !defined(__wasm__) -#include -#else -void *memcpy(void *, const void*, size_t); -void *memset(void *, int, size_t); -#endif - #include "../../internal.h" #if defined(OPENSSL_SSE2) @@ -353,7 +346,7 @@ static inline uint8_t lo(uint32_t a) { static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], const uint8_t in[16]) { - memcpy(out, in, 16); + GFp_memcpy(out, in, 16); #if defined(OPENSSL_SSE2) // No conversions needed. #elif defined(OPENSSL_64_BIT) @@ -381,7 +374,7 @@ static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], static inline void aes_nohw_uncompact_block( uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { #if defined(OPENSSL_SSE2) - memcpy(out, in, 16); // No conversions needed. + GFp_memcpy(out, in, 16); // No conversions needed. #elif defined(OPENSSL_64_BIT) uint64_t a0 = in[0]; uint64_t a1 = in[1]; @@ -389,8 +382,8 @@ static inline void aes_nohw_uncompact_block( aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); uint64_t b1 = aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); - memcpy(out, &b0, 8); - memcpy(out + 8, &b1, 8); + GFp_memcpy(out, &b0, 8); + GFp_memcpy(out + 8, &b1, 8); #else uint32_t a0 = in[0]; uint32_t a1 = in[1]; @@ -411,10 +404,10 @@ static inline void aes_nohw_uncompact_block( b1 = aes_nohw_uncompact_word(b1); b2 = aes_nohw_uncompact_word(b2); b3 = aes_nohw_uncompact_word(b3); - memcpy(out, &b0, 4); - memcpy(out + 4, &b1, 4); - memcpy(out + 8, &b2, 4); - memcpy(out + 12, &b3, 4); + GFp_memcpy(out, &b0, 4); + GFp_memcpy(out + 4, &b1, 4); + GFp_memcpy(out + 8, &b2, 4); + GFp_memcpy(out + 12, &b3, 4); #endif } @@ -482,7 +475,7 @@ static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, size_t num_blocks) { // Don't leave unused blocks uninitialized. - memset(out, 0, sizeof(AES_NOHW_BATCH)); + GFp_memset(out, 0, sizeof(AES_NOHW_BATCH)); debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); for (size_t i = 0; i < num_blocks; i++) { aes_word_t block[AES_NOHW_BLOCK_WORDS]; @@ -777,7 +770,7 @@ static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, // Copy the round key into each block in the batch. 
for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; - memcpy(tmp, key->rd_key + 4 * i, 16); + GFp_memcpy(tmp, key->rd_key + 4 * i, 16); aes_nohw_batch_set(&out->keys[i], tmp, j); } aes_nohw_transpose(&out->keys[i]); @@ -801,7 +794,7 @@ static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { AES_NOHW_BATCH batch; - memset(&batch, 0, sizeof(batch)); + GFp_memset(&batch, 0, sizeof(batch)); aes_nohw_batch_set(&batch, in, 0); aes_nohw_transpose(&batch); aes_nohw_sub_bytes(&batch); @@ -814,7 +807,7 @@ static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { aes_word_t block[AES_NOHW_BLOCK_WORDS]; aes_nohw_compact_block(block, in); - memcpy(key->rd_key, block, 16); + GFp_memcpy(key->rd_key, block, 16); for (size_t i = 1; i <= 10; i++) { aes_word_t sub[AES_NOHW_BLOCK_WORDS]; @@ -833,113 +826,7 @@ static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); } - memcpy(key->rd_key + 4 * i, block, 16); - } -} - -static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) { - key->rounds = 12; - - aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS]; - aes_word_t *block1 = storage1, *block2 = storage2; - - // AES-192's key schedule is complex because each key schedule iteration - // produces six words, but we compute on blocks and each block is four words. - // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time. - // We loop below every three blocks or two key schedule iterations. - // - // On entry to the loop, |block1| and the first half of |block2| contain the - // previous key schedule iteration. |block1| has been written to |key|, but - // |block2| has not as it is incomplete. - aes_nohw_compact_block(block1, in); - memcpy(key->rd_key, block1, 16); - - uint8_t half_block[16] = {0}; - memcpy(half_block, in + 16, 8); - aes_nohw_compact_block(block2, half_block); - - for (size_t i = 0; i < 4; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[2 * i]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Compute the first two words of the next key schedule iteration, which - // go in the second half of |block2|. The first two words of the previous - // iteration are in the first half of |block1|. Apply |rcon| here too - // because the shifts match. - block2[j] = aes_nohw_or( - block2[j], - aes_nohw_shift_left( - aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8)); - // Incorporate the transformed word and propagate. Note the last word of - // the previous iteration corresponds to the second word of |copy|. This - // is incorporated into the first word of the next iteration, or the third - // word of |block2|. - block2[j] = aes_nohw_xor( - block2[j], aes_nohw_and(aes_nohw_shift_left( - aes_nohw_rotate_rows_down(sub[j]), 4), - AES_NOHW_COL2_MASK)); - block2[j] = aes_nohw_xor( - block2[j], - aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK)); - - // Compute the remaining four words, which fill |block1|. Begin by moving - // the corresponding words of the previous iteration: the second half of - // |block1| and the first half of |block2|. 
- block1[j] = aes_nohw_shift_right(block1[j], 8); - block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8)); - // Incorporate the second word, computed previously in |block2|, and - // propagate. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - - // This completes two round keys. Note half of |block2| was computed in the - // previous loop iteration but was not yet output. - memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16); - memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16); - - aes_nohw_sub_block(sub, block1); - rcon = aes_nohw_rcon[2 * i + 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Compute the first four words of the next key schedule iteration in - // |block2|. Begin by moving the corresponding words of the previous - // iteration: the second half of |block2| and the first half of |block1|. - block2[j] = aes_nohw_shift_right(block2[j], 8); - block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8)); - // Incorporate rcon and the transformed word. Note the last word of the - // previous iteration corresponds to the last word of |copy|. - block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j)); - block2[j] = aes_nohw_xor( - block2[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - - // Compute the last two words, which go in the first half of |block1|. The - // last two words of the previous iteration are in the second half of - // |block1|. - block1[j] = aes_nohw_shift_right(block1[j], 8); - // Propagate blocks and mask off the excess. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4)); - block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK); - } - - // |block2| has a complete round key. |block1| will be completed in the next - // iteration. - memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16); - - // Swap blocks to restore the invariant. - aes_word_t *tmp = block1; - block1 = block2; - block2 = tmp; + GFp_memcpy(key->rd_key + 4 * i, block, 16); } } @@ -949,10 +836,10 @@ static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { // Each key schedule iteration produces two round keys. 
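As context for the bitsliced AES-256 schedule below: it implements the standard FIPS 197 word-level recurrence, four words per block at a time. A minimal scalar sketch of that recurrence (an editor's illustration, not code from this patch; sub_word and rot_word are assumed helpers for SubWord and RotWord, and words are assumed packed with the first key byte in the low byte so the rcon constant lands in the right position):

    #include <stddef.h>
    #include <stdint.h>

    uint32_t sub_word(uint32_t w);  // assumed helper: S-box applied to each byte
    uint32_t rot_word(uint32_t w);  // assumed helper: rotate word by one byte

    // Expand a 256-bit key into the 60 words (15 round keys) of the schedule.
    void aes256_expand_sketch(uint32_t w[60], const uint32_t key[8]) {
      static const uint32_t rcon[7] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40};
      for (size_t i = 0; i < 8; i++) {
        w[i] = key[i];
      }
      for (size_t i = 8; i < 60; i++) {
        uint32_t t = w[i - 1];
        if (i % 8 == 0) {
          t = sub_word(rot_word(t)) ^ rcon[i / 8 - 1];  // first round key of the pair
        } else if (i % 8 == 4) {
          t = sub_word(t);  // the extra SubWord step unique to AES-256
        }
        w[i] = w[i - 8] ^ t;
      }
    }

Each pass of the loop below produces the same two round keys per iteration, one via |block1| and one via |block2|, with the shift/xor chains performing the word-by-word propagation in bitsliced form.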
aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; aes_nohw_compact_block(block1, in); - memcpy(key->rd_key, block1, 16); + GFp_memcpy(key->rd_key, block1, 16); aes_nohw_compact_block(block2, in + 16); - memcpy(key->rd_key + 4, block2, 16); + GFp_memcpy(key->rd_key + 4, block2, 16); for (size_t i = 2; i <= 14; i += 2) { aes_word_t sub[AES_NOHW_BLOCK_WORDS]; @@ -970,7 +857,7 @@ static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); } - memcpy(key->rd_key + 4 * i, block1, 16); + GFp_memcpy(key->rd_key + 4 * i, block1, 16); if (i == 14) { break; @@ -986,7 +873,7 @@ static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); } - memcpy(key->rd_key + 4 * (i + 1), block2, 16); + GFp_memcpy(key->rd_key + 4 * (i + 1), block2, 16); } } @@ -999,9 +886,6 @@ int GFp_aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, case 128: aes_nohw_setup_key_128(aeskey, key); return 0; - case 192: - aes_nohw_setup_key_192(aeskey, key); - return 0; case 256: aes_nohw_setup_key_256(aeskey, key); return 0; @@ -1022,10 +906,10 @@ static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], const uint8_t b[16]) { for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { aes_word_t x, y; - memcpy(&x, a + i, sizeof(aes_word_t)); - memcpy(&y, b + i, sizeof(aes_word_t)); + GFp_memcpy(&x, a + i, sizeof(aes_word_t)); + GFp_memcpy(&y, b + i, sizeof(aes_word_t)); x = aes_nohw_xor(x, y); - memcpy(out + i, &x, sizeof(aes_word_t)); + GFp_memcpy(out + i, &x, sizeof(aes_word_t)); } } @@ -1045,7 +929,7 @@ void GFp_aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, uint8_t u8[AES_NOHW_BATCH_SIZE * 16]; } ivs, enc_ivs; for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - memcpy(ivs.u8 + 16 * i, ivec, 16); + GFp_memcpy(ivs.u8 + 16 * i, ivec, 16); } uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]); diff --git a/crypto/fipsmodule/aes/asm/aesv8-armx.pl b/crypto/fipsmodule/aes/asm/aesv8-armx.pl index c1dcde0c4e..804df8181d 100644 --- a/crypto/fipsmodule/aes/asm/aesv8-armx.pl +++ b/crypto/fipsmodule/aes/asm/aesv8-armx.pl @@ -96,6 +96,8 @@ .Lenc_key: ___ $code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ @@ -249,6 +251,7 @@ () .type GFp_${prefix}_${dir}crypt,%function .align 5 GFp_${prefix}_${dir}crypt: + AARCH64_VALID_CALL_TARGET ldr $rounds,[$key,#240] vld1.32 {$rndkey0},[$key],#16 vld1.8 {$inout},[$inp] @@ -299,6 +302,8 @@ () GFp_${prefix}_ctr32_encrypt_blocks: ___ $code.=<<___ if ($flavour =~ /64/); + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ @@ -326,20 +331,34 @@ () add $key_,$key,#32 mov $cnt,$rounds cclr $step,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. 
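To make the hazard concrete, here is a hedged before/after sketch using this function's register names (an editor's illustration assembled from the instructions this hunk removes and adds, not new patch lines):

    vmov.32  ${dat1}[3],$tctr1   // removed pattern: one-lane write into $dat1,
    aese     $dat1,q8            // which is then consumed by an aese/aesmc pair

    vmov.32  ${ivec}[3],$tctr1   // workaround: stage the lane write in $ivec
    vorr     $dat1,$ivec,$ivec   // then copy the whole register into $dat1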
+ // + // This function uses a counter in one 32-bit lane. The vmov.32 lines + // could write to $dat1 and $dat2 directly, but that trips these bugs. + // We write to $ivec and copy to the final register as a workaround. + // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice #ifndef __ARMEB__ rev $ctr, $ctr #endif - vorr $dat1,$dat0,$dat0 add $tctr1, $ctr, #1 - vorr $dat2,$dat0,$dat0 - add $ctr, $ctr, #2 vorr $ivec,$dat0,$dat0 rev $tctr1, $tctr1 - vmov.32 ${dat1}[3],$tctr1 + vmov.32 ${ivec}[3],$tctr1 + add $ctr, $ctr, #2 + vorr $dat1,$ivec,$ivec b.ls .Lctr32_tail rev $tctr2, $ctr + vmov.32 ${ivec}[3],$tctr2 sub $len,$len,#3 // bias - vmov.32 ${dat2}[3],$tctr2 + vorr $dat2,$ivec,$ivec b .Loop3x_ctr32 .align 4 @@ -366,11 +385,11 @@ () aese $dat1,q8 aesmc $tmp1,$dat1 vld1.8 {$in0},[$inp],#16 - vorr $dat0,$ivec,$ivec + add $tctr0,$ctr,#1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 - vorr $dat1,$ivec,$ivec + rev $tctr0,$tctr0 aese $tmp0,q9 aesmc $tmp0,$tmp0 aese $tmp1,q9 @@ -379,8 +398,6 @@ () mov $key_,$key aese $dat2,q9 aesmc $tmp2,$dat2 - vorr $dat2,$ivec,$ivec - add $tctr0,$ctr,#1 aese $tmp0,q12 aesmc $tmp0,$tmp0 aese $tmp1,q12 @@ -395,21 +412,26 @@ () aesmc $tmp0,$tmp0 aese $tmp1,q13 aesmc $tmp1,$tmp1 + // Note the logic to update $dat0, $dat1, and $dat2 is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. veor $in2,$in2,$rndlast - rev $tctr0,$tctr0 + vmov.32 ${ivec}[3], $tctr0 aese $tmp2,q13 aesmc $tmp2,$tmp2 - vmov.32 ${dat0}[3], $tctr0 + vorr $dat0,$ivec,$ivec rev $tctr1,$tctr1 aese $tmp0,q14 aesmc $tmp0,$tmp0 + vmov.32 ${ivec}[3], $tctr1 + rev $tctr2,$ctr aese $tmp1,q14 aesmc $tmp1,$tmp1 - vmov.32 ${dat1}[3], $tctr1 - rev $tctr2,$ctr + vorr $dat1,$ivec,$ivec + vmov.32 ${ivec}[3], $tctr2 aese $tmp2,q14 aesmc $tmp2,$tmp2 - vmov.32 ${dat2}[3], $tctr2 + vorr $dat2,$ivec,$ivec subs $len,$len,#3 aese $tmp0,q15 aese $tmp1,q15 diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl index 1ab7e0d954..b31bbb81f2 100755 --- a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl +++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl @@ -49,6 +49,8 @@ *STDOUT=*OUT; $code.=<<___; +#include + .section .rodata .type _vpaes_consts,%object @@ -237,6 +239,7 @@ .type GFp_vpaes_encrypt,%function .align 4 GFp_vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -246,6 +249,7 @@ st1 {v0.16b}, [$out] ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt @@ -391,6 +395,7 @@ .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! add x29,sp,#0 @@ -550,6 +555,7 @@ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -720,6 +726,7 @@ .type GFp_vpaes_set_encrypt_key,%function .align 4 GFp_vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]!
// ABI spec says so @@ -735,6 +742,7 @@ ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key ___ @@ -750,6 +758,7 @@ .type GFp_vpaes_ctr32_encrypt_blocks,%function .align 4 GFp_vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so @@ -817,6 +826,7 @@ ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks ___ diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl index 038006dc6b..dbc28b51d4 100644 --- a/crypto/fipsmodule/bn/asm/armv4-mont.pl +++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl @@ -112,6 +112,8 @@ #endif #if __ARM_MAX_ARCH__>=7 +.extern GFp_armcap_P +.hidden GFp_armcap_P .align 5 .LOPENSSL_armcap: .word GFp_armcap_P-.Lbn_mul_mont @@ -744,11 +746,6 @@ } $code.=<<___; .asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif ___ foreach (split("\n",$code)) { diff --git a/crypto/fipsmodule/bn/asm/armv8-mont.pl b/crypto/fipsmodule/bn/asm/armv8-mont.pl index da93f3aa15..717ea68cf1 100644 --- a/crypto/fipsmodule/bn/asm/armv8-mont.pl +++ b/crypto/fipsmodule/bn/asm/armv8-mont.pl @@ -64,12 +64,15 @@ $num="x5"; # size_t num); $code.=<<___; +#include + .text .globl GFp_bn_mul_mont .type GFp_bn_mul_mont,%function .align 5 GFp_bn_mul_mont: + AARCH64_SIGN_LINK_REGISTER tst $num,#7 b.eq __bn_sqr8x_mont tst $num,#3 @@ -267,6 +270,7 @@ mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER ret .size GFp_bn_mul_mont,.-GFp_bn_mul_mont ___ @@ -284,6 +288,8 @@ .type __bn_sqr8x_mont,%function .align 5 __bn_sqr8x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont which has already signed the return address. cmp $ap,$bp b.ne __bn_mul4x_mont .Lsqr8x_mont: @@ -1040,6 +1046,8 @@ ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_sqr8x_mont,.-__bn_sqr8x_mont ___ @@ -1063,6 +1071,9 @@ .type __bn_mul4x_mont,%function .align 5 __bn_mul4x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont or __bn_sqr8x_mont which have already signed the + // return address. stp x29,x30,[sp,#-128]!
add x29,sp,#0 stp x19,x20,[sp,#16] @@ -1496,6 +1507,8 @@ ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_mul4x_mont,.-__bn_mul4x_mont ___ diff --git a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl index e40aaa92d4..f30025e901 100755 --- a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl +++ b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl @@ -2136,7 +2136,7 @@ $code.=<<___; ################################################################################ -# void GFp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, crypto_word index); .globl GFp_nistz256_select_w5 .type GFp_nistz256_select_w5,\@abi-omnipotent .align 32 @@ -2236,7 +2236,7 @@ .size GFp_nistz256_select_w5,.-GFp_nistz256_select_w5 ################################################################################ -# void GFp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, crypto_word index); .globl GFp_nistz256_select_w7 .type GFp_nistz256_select_w7,\@abi-omnipotent .align 32 @@ -2333,7 +2333,7 @@ $code.=<<___; ################################################################################ -# void GFp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, crypto_word index); .type GFp_nistz256_avx2_select_w5,\@abi-omnipotent .align 32 GFp_nistz256_avx2_select_w5: @@ -2440,7 +2440,7 @@ $code.=<<___; ################################################################################ -# void GFp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); +# void GFp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, crypto_word index); .globl GFp_nistz256_avx2_select_w7 .type GFp_nistz256_avx2_select_w7,\@abi-omnipotent .align 32 diff --git a/crypto/fipsmodule/ec/ecp_nistz.h b/crypto/fipsmodule/ec/ecp_nistz.h index 74d31007a8..2bcf4b5d4d 100644 --- a/crypto/fipsmodule/ec/ecp_nistz.h +++ b/crypto/fipsmodule/ec/ecp_nistz.h @@ -246,16 +246,16 @@ // P-384: ...01110011; w = 2, 5, 6, 7 are okay // P-256: ...01010001; w = 5, 7 are okay // P-224: ...00111101; w = 3, 4, 5, 6 are okay -static inline void booth_recode(Limb *is_negative, unsigned *digit, - unsigned in, unsigned w) { +static inline void booth_recode(crypto_word *is_negative, crypto_word *digit, + crypto_word in, crypto_word w) { debug_assert_nonsecret(w >= 2); debug_assert_nonsecret(w <= 7); // Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|, // but 'in' seen as (`w+1`)-bit value. 
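  // Worked example (editor's sketch, not part of the patch): with w = 5, the
  // raw window value in = 54 = 0b110110 has its top bit set, so
  //   s = ~((54 >> 5) - 1) = all-ones,
  //   d = (1 << 6) - 54 - 1 = 9,  d = (9 & s) | (54 & ~s) = 9,
  //   d = (9 >> 1) + (9 & 1) = 5.
  // Read as a signed 6-bit quantity, 54 is -10 = 2 * (-5), so the recoding
  // yields digit magnitude 5 with *is_negative set, as expected.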
- Limb s = ~((in >> w) - 1); - unsigned d; - d = (1 << (w + 1)) - in - 1; + crypto_word s = ~((in >> w) - 1); + crypto_word d; + d = ((crypto_word)1u << (w + 1)) - in - 1; d = (d & s) | (in & ~s); d = (d >> 1) + (d & 1); diff --git a/crypto/fipsmodule/ec/ecp_nistz256.c b/crypto/fipsmodule/ec/ecp_nistz256.c index 34602956d2..b71100cdad 100644 --- a/crypto/fipsmodule/ec/ecp_nistz256.c +++ b/crypto/fipsmodule/ec/ecp_nistz256.c @@ -193,8 +193,8 @@ void GFp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS], const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) { - static const unsigned kWindowSize = 5; - static const unsigned kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + static const size_t kWindowSize = 5; + static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1]; gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), @@ -232,23 +232,22 @@ void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS], Limb tmp[P256_LIMBS]; alignas(32) P256_POINT h; - static const unsigned START_INDEX = 256 - 1; - unsigned index = START_INDEX; + static const size_t START_INDEX = 256 - 1; + size_t index = START_INDEX; - unsigned raw_wvalue; - Limb recoded_is_negative; - unsigned recoded; + crypto_word raw_wvalue; + crypto_word recoded_is_negative; + crypto_word recoded; raw_wvalue = p_str[(index - 1) / 8]; raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask; - booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize); dev_assert_secret(!recoded_is_negative); GFp_nistz256_select_w5(r, table, recoded); while (index >= kWindowSize) { if (index != START_INDEX) { - unsigned off = (index - 1) / 8; + size_t off = (index - 1) / 8; raw_wvalue = p_str[off] | p_str[off + 1] << 8; raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask; @@ -286,12 +285,12 @@ void GFp_nistz256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS], /* Precomputed tables for the default generator */ #include "ecp_nistz256_table.inl" -static const unsigned kWindowSize = 7; +static const size_t kWindowSize = 7; static inline void select_precomputed(P256_POINT_AFFINE *p, size_t i, - unsigned raw_wvalue) { - Limb recoded_is_negative; - unsigned recoded; + crypto_word raw_wvalue) { + crypto_word recoded_is_negative; + crypto_word recoded; booth_recode(&recoded_is_negative, &recoded, raw_wvalue, kWindowSize); GFp_nistz256_select_w7(p, GFp_nistz256_precomputed[i], recoded); Limb neg_y[P256_LIMBS]; @@ -312,18 +311,18 @@ static Limb is_infinity(const Limb x[P256_LIMBS], void GFp_nistz256_point_mul_base(P256_POINT *r, const Limb g_scalar[P256_LIMBS]) { - static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + static const crypto_word kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; uint8_t p_str[(P256_LIMBS * sizeof(Limb)) + 1]; gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), g_scalar, P256_LIMBS); /* First window */ - unsigned index = kWindowSize; + size_t index = kWindowSize; alignas(32) P256_POINT_AFFINE t; - unsigned raw_wvalue = (p_str[0] << 1) & kMask; + crypto_word raw_wvalue = (p_str[0] << 1) & kMask; select_precomputed(&t, 0, raw_wvalue); alignas(32) P256_POINT p; @@ -334,7 +333,7 @@ void GFp_nistz256_point_mul_base(P256_POINT *r, copy_conditional(p.Z, p.X, is_infinity(p.X, p.Y)); for (size_t i = 1; i < 37; i++) { - unsigned off = (index - 1) / 8; + size_t off = (index - 1) / 8; raw_wvalue = 
p_str[off] | p_str[off + 1] << 8; raw_wvalue = (raw_wvalue >> ((index - 1) % 8)) & kMask; index += kWindowSize; diff --git a/crypto/fipsmodule/ec/ecp_nistz256.h b/crypto/fipsmodule/ec/ecp_nistz256.h index 01ad2e148d..561d4155f7 100644 --- a/crypto/fipsmodule/ec/ecp_nistz256.h +++ b/crypto/fipsmodule/ec/ecp_nistz256.h @@ -45,10 +45,10 @@ void GFp_nistz256_sqr_mont(Limb res[P256_LIMBS], const Limb a[P256_LIMBS]); /* Functions that perform constant time access to the precomputed tables */ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], - int index); + crypto_word index); #if defined(GFp_USE_LARGE_TABLE) -void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, const PRECOMP256_ROW table, int index); +void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, const PRECOMP256_ROW table, crypto_word index); #endif #endif /* OPENSSL_HEADER_EC_ECP_NISTZ256_H */ diff --git a/crypto/fipsmodule/ec/ecp_nistz384.inl b/crypto/fipsmodule/ec/ecp_nistz384.inl index 718e4a7915..12fc9d9d35 100644 --- a/crypto/fipsmodule/ec/ecp_nistz384.inl +++ b/crypto/fipsmodule/ec/ecp_nistz384.inl @@ -157,10 +157,10 @@ void GFp_nistz384_point_add(P384_POINT *r, const P384_POINT *a, limbs_copy(r->Z, res_z, P384_LIMBS); } -static void add_precomputed_w5(P384_POINT *r, unsigned wvalue, +static void add_precomputed_w5(P384_POINT *r, crypto_word wvalue, const P384_POINT table[16]) { - BN_ULONG recoded_is_negative; - unsigned int recoded; + crypto_word recoded_is_negative; + crypto_word recoded; booth_recode(&recoded_is_negative, &recoded, wvalue, 5); alignas(64) P384_POINT h; @@ -177,8 +177,8 @@ static void add_precomputed_w5(P384_POINT *r, unsigned wvalue, void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS], const BN_ULONG p_x[P384_LIMBS], const BN_ULONG p_y[P384_LIMBS]) { - static const unsigned kWindowSize = 5; - static const unsigned kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + static const size_t kWindowSize = 5; + static const crypto_word kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1]; gfp_little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), @@ -214,13 +214,13 @@ void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS], GFp_nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); GFp_nistz384_point_double(&row[16 - 1], &row[8 - 1]); - static const unsigned START_INDEX = 384 - 4; - unsigned index = START_INDEX; + static const size_t START_INDEX = 384 - 4; + size_t index = START_INDEX; BN_ULONG recoded_is_negative; - unsigned recoded; + crypto_word recoded; - unsigned wvalue = p_str[(index - 1) / 8]; + crypto_word wvalue = p_str[(index - 1) / 8]; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; booth_recode(&recoded_is_negative, &recoded, wvalue, 5); @@ -230,7 +230,7 @@ void GFp_nistz384_point_mul(P384_POINT *r, const BN_ULONG p_scalar[P384_LIMBS], while (index >= kWindowSize) { if (index != START_INDEX) { - unsigned off = (index - 1) / 8; + size_t off = (index - 1) / 8; wvalue = p_str[off] | p_str[off + 1] << 8; wvalue = (wvalue >> ((index - 1) % 8)) & kMask; diff --git a/crypto/fipsmodule/ec/gfp_p256.c b/crypto/fipsmodule/ec/gfp_p256.c index 5e9046a960..60678ec6d9 100644 --- a/crypto/fipsmodule/ec/gfp_p256.c +++ b/crypto/fipsmodule/ec/gfp_p256.c @@ -73,9 +73,8 @@ void GFp_p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) { /* TODO(perf): Optimize these. 
*/ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], - int index) { + crypto_word index) { dev_assert_secret(index >= 0); - size_t index_s = (size_t)index; /* XXX: constant time? */ alignas(32) Elem x; limbs_zero(x, P256_LIMBS); alignas(32) Elem y; limbs_zero(y, P256_LIMBS); @@ -83,7 +82,7 @@ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], // TODO: Rewrite in terms of |limbs_select|. for (size_t i = 0; i < 16; ++i) { - Limb equal = constant_time_eq_w(index_s, i + 1); + crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1); for (size_t j = 0; j < P256_LIMBS; ++j) { x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); @@ -98,12 +97,9 @@ void GFp_nistz256_select_w5(P256_POINT *out, const P256_POINT table[16], #if defined GFp_USE_LARGE_TABLE void GFp_nistz256_select_w7(P256_POINT_AFFINE *out, - const PRECOMP256_ROW table, int index) { - dev_assert_secret(index >= 0); - size_t index_as_s = (size_t)index; /* XXX: constant time? */ - + const PRECOMP256_ROW table, crypto_word index) { alignas(32) Limb xy[P256_LIMBS * 2]; - limbs_select(xy, table, P256_LIMBS * 2, 64, index_as_s - 1); + limbs_select(xy, table, P256_LIMBS * 2, 64, index - 1); limbs_copy(out->X, &xy[0], P256_LIMBS); limbs_copy(out->Y, &xy[P256_LIMBS], P256_LIMBS); } diff --git a/crypto/fipsmodule/ec/gfp_p384.c b/crypto/fipsmodule/ec/gfp_p384.c index 641f4a70cd..820fac4a15 100644 --- a/crypto/fipsmodule/ec/gfp_p384.c +++ b/crypto/fipsmodule/ec/gfp_p384.c @@ -225,7 +225,7 @@ static void gfp_p384_point_select_w5(P384_POINT *out, // TODO: Rewrite in terms of |limbs_select|. for (size_t i = 0; i < 16; ++i) { - Limb equal = constant_time_eq_w(index, i + 1); + crypto_word equal = constant_time_eq_w(index, (crypto_word)i + 1); for (size_t j = 0; j < P384_LIMBS; ++j) { x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl index f90550b06c..7e52ad667f 100644 --- a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl +++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl @@ -157,12 +157,15 @@ sub clmul64x64 { } $code .= <<___; +#include + .text .global GFp_gcm_init_neon .type GFp_gcm_init_neon,%function .align 4 GFp_gcm_init_neon: + AARCH64_VALID_CALL_TARGET // This function is adapted from gcm_init_v8. xC2 is t3. 
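The 0xe1 loaded just below encodes the GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1 in bit-reflected form, and the "twisted H" these init routines produce is H doubled modulo that polynomial. A hedged C sketch of one such doubling in the reflected software convention, in the spirit of the REDUCE1BIT step found in generic GCM implementations (editor's illustration; the function name is hypothetical):

    #include <stdint.h>

    // Multiply a GHASH field element h = (high, low) by x in the bit-reflected
    // convention: shift right one bit and, when a bit falls off the low end,
    // fold the reduction polynomial (0xe1 << 56) back into the high half.
    static void ghash_double_sketch(uint64_t h[2]) {
      uint64_t carry = h[1] & 1;  // h[0] = high half, h[1] = low half
      h[1] = (h[1] >> 1) | (h[0] << 63);
      h[0] = (h[0] >> 1) ^ (carry ? UINT64_C(0xe1) << 56 : 0);
    }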
ld1 {$t1.2d}, [x1] // load H movi $t3.16b, #0xe1 @@ -187,6 +190,7 @@ sub clmul64x64 { .type GFp_gcm_gmult_neon,%function .align 4 GFp_gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET ld1 {$INlo.16b}, [$Xi] // load Xi ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H ld1 {$Hhi.1d}, [$Htbl] @@ -205,6 +209,7 @@ sub clmul64x64 { .type GFp_gcm_ghash_neon,%function .align 4 GFp_gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET ld1 {$Xl.16b}, [$Xi] // load Xi ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H ld1 {$Hhi.1d}, [$Htbl] diff --git a/crypto/fipsmodule/modes/asm/ghashv8-armx.pl b/crypto/fipsmodule/modes/asm/ghashv8-armx.pl index a477cae8fd..3a551c2901 100644 --- a/crypto/fipsmodule/modes/asm/ghashv8-armx.pl +++ b/crypto/fipsmodule/modes/asm/ghashv8-armx.pl @@ -86,6 +86,7 @@ .type GFp_gcm_init_clmul,%function .align 4 GFp_gcm_init_clmul: + AARCH64_VALID_CALL_TARGET vld1.64 {$t1},[x1] @ load input H vmov.i8 $xC2,#0xe1 vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 @@ -145,6 +146,7 @@ .type GFp_gcm_gmult_clmul,%function .align 4 GFp_gcm_gmult_clmul: + AARCH64_VALID_CALL_TARGET vld1.64 {$t1},[$Xi] @ load Xi vmov.i8 $xC2,#0xe1 vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... @@ -198,6 +200,7 @@ .type GFp_gcm_ghash_clmul,%function .align 4 GFp_gcm_ghash_clmul: + AARCH64_VALID_CALL_TARGET ___ $code.=<<___ if ($flavour !~ /64/); vstmdb sp!,{d8-d15} @ 32-bit ABI says so diff --git a/crypto/fipsmodule/modes/internal.h b/crypto/fipsmodule/modes/internal.h deleted file mode 100644 index efccd24cd6..0000000000 --- a/crypto/fipsmodule/modes/internal.h +++ /dev/null @@ -1,57 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#ifndef OPENSSL_HEADER_MODES_INTERNAL_H -#define OPENSSL_HEADER_MODES_INTERNAL_H - -#include "../../internal.h" - -// GCM definitions -typedef struct { uint64_t hi,lo; } u128; - -#endif // OPENSSL_HEADER_MODES_INTERNAL_H diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl index d71fc82e22..d8661a0911 100644 --- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl @@ -218,6 +218,8 @@ sub BODY_16_XX { .size K256,.-K256 .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.extern GFp_armcap_P +.hidden GFp_armcap_P .LOPENSSL_armcap: .word GFp_armcap_P-.Lsha256_block_data_order #endif @@ -687,11 +689,6 @@ () }}} $code.=<<___; .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif ___ open SELF,$0; diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl index 4543f4566c..21c7ebddba 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl @@ -278,6 +278,8 @@ () WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.extern GFp_armcap_P +.hidden GFp_armcap_P .LOPENSSL_armcap: .word GFp_armcap_P-.Lsha512_block_data_order .skip 32-4 @@ -651,11 +653,6 @@ () } $code.=<<___; .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/crypto/fipsmodule/sha/asm/sha512-armv8.pl index d8667c8db8..bb80b7c96b 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv8.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv8.pl @@ -179,12 +179,14 @@ sub BODY_00_xx { .text .extern GFp_armcap_P +.hidden GFp_armcap_P .globl $func .type $func,%function .align 6 $func: ___ $code.=<<___ if ($SZ==4); + AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 adrp x16,:pg_hi21_nc:GFp_armcap_P @@ -197,6 +199,7 @@ sub BODY_00_xx { #endif ___ $code.=<<___; + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -259,6 +262,7 @@ sub BODY_00_xx { ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER ret .size $func,.-$func @@ -350,6 +354,7 @@ sub BODY_00_xx { .align 6 sha256_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! 
add x29,sp,#0 @@ -419,13 +424,6 @@ sub BODY_00_xx { ___ } -$code.=<<___; -#ifndef __KERNEL__ -.comm GFp_armcap_P,4,4 -.hidden GFp_armcap_P -#endif -___ - { my %opcode = ( "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); diff --git a/crypto/internal.h b/crypto/internal.h index 57607bfc38..1877bec7f6 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -259,10 +259,39 @@ static inline uint32_t CRYPTO_bswap4(uint32_t x) { } #endif -static inline void bytes_copy(uint8_t out[], const uint8_t in[], size_t len) { - for (size_t i = 0; i < len; ++i) { - out[i] = in[i]; +#if !defined(GFp_NOSTDLIBINC) +#include +#endif + +static inline void *GFp_memcpy(void *dst, const void *src, size_t n) { +#if !defined(GFp_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memcpy(dst, src, n); +#else + unsigned char *d = dst; + const unsigned char *s = src; + for (size_t i = 0; i < n; ++i) { + d[i] = s[i]; } + return dst; +#endif +} + +static inline void *GFp_memset(void *dst, int c, size_t n) { +#if !defined(GFp_NOSTDLIBINC) + if (n == 0) { + return dst; + } + return memset(dst, c, n); +#else + unsigned char *d = dst; + for (size_t i = 0; i < n; ++i) { + d[i] = (unsigned char)c; + } + return dst; +#endif } #endif // OPENSSL_HEADER_CRYPTO_INTERNAL_H diff --git a/crypto/perlasm/.gitattributes b/crypto/perlasm/.gitattributes deleted file mode 100644 index d77060900d..0000000000 --- a/crypto/perlasm/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -*.pl linguist-language=Perl - diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl deleted file mode 100755 index 1265676bbb..0000000000 --- a/crypto/poly1305/asm/poly1305-armv4.pl +++ /dev/null @@ -1,1246 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# IALU(*)/gcc-4.4 NEON -# -# ARM11xx(ARMv6) 7.78/+100% - -# Cortex-A5 6.35/+130% 3.00 -# Cortex-A8 6.25/+115% 2.36 -# Cortex-A9 5.10/+95% 2.55 -# Cortex-A15 3.85/+85% 1.25(**) -# Snapdragon S4 5.70/+100% 1.48(**) -# -# (*) this is for -march=armv6, i.e. 
with bunch of ldrb loading data; -# (**) these are trade-off results, they can be improved by ~8% but at -# the cost of 15/12% regression on Cortex-A5/A7, it's even possible -# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; - -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); - -$code.=<<___; -#include - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl GFp_poly1305_emit -.globl GFp_poly1305_blocks -.globl GFp_poly1305_init_asm - -.type GFp_poly1305_init_asm,%function -.align 5 -GFp_poly1305_init_asm: -.Lpoly1305_init: - stmdb sp!,{r4-r11} - - eor r3,r3,r3 - cmp $inp,#0 - str r3,[$ctx,#0] @ zero hash value - str r3,[$ctx,#4] - str r3,[$ctx,#8] - str r3,[$ctx,#12] - str r3,[$ctx,#16] - str r3,[$ctx,#36] @ is_base2_26 - add $ctx,$ctx,#20 - -#ifdef __thumb2__ - it eq -#endif - moveq r0,#0 - beq .Lno_key - -#if __ARM_MAX_ARCH__>=7 - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap -#endif - ldrb r4,[$inp,#0] - mov r10,#0x0fffffff - ldrb r5,[$inp,#1] - and r3,r10,#-4 @ 0x0ffffffc - ldrb r6,[$inp,#2] - ldrb r7,[$inp,#3] - orr r4,r4,r5,lsl#8 - ldrb r5,[$inp,#4] - orr r4,r4,r6,lsl#16 - ldrb r6,[$inp,#5] - orr r4,r4,r7,lsl#24 - ldrb r7,[$inp,#6] - and r4,r4,r10 - -#if __ARM_MAX_ARCH__>=7 - ldr r12,[r11,r12] @ GFp_armcap_P -# ifdef __APPLE__ - ldr r12,[r12] -# endif -#endif - ldrb r8,[$inp,#7] - orr r5,r5,r6,lsl#8 - ldrb r6,[$inp,#8] - orr r5,r5,r7,lsl#16 - ldrb r7,[$inp,#9] - orr r5,r5,r8,lsl#24 - ldrb r8,[$inp,#10] - and r5,r5,r3 - -#if __ARM_MAX_ARCH__>=7 - tst r12,#ARMV7_NEON @ check for NEON -# ifdef __APPLE__ - adr r9,poly1305_blocks_neon - adr r11,GFp_poly1305_blocks -# ifdef __thumb2__ - it ne -# endif - movne r11,r9 - adr r12,GFp_poly1305_emit - adr r10,poly1305_emit_neon -# ifdef __thumb2__ - it ne -# endif - movne r12,r10 -# else -# ifdef __thumb2__ - itete eq -# endif - addeq r12,r11,#(GFp_poly1305_emit-.Lpoly1305_init) - addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) - addeq r11,r11,#(GFp_poly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) -# endif -# ifdef __thumb2__ - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 -# endif -#endif - ldrb r9,[$inp,#11] - orr r6,r6,r7,lsl#8 - ldrb r7,[$inp,#12] - orr r6,r6,r8,lsl#16 - ldrb r8,[$inp,#13] - orr r6,r6,r9,lsl#24 - ldrb r9,[$inp,#14] - and r6,r6,r3 - - ldrb r10,[$inp,#15] - orr r7,r7,r8,lsl#8 - str r4,[$ctx,#0] - orr r7,r7,r9,lsl#16 - str r5,[$ctx,#4] - orr r7,r7,r10,lsl#24 - str r6,[$ctx,#8] - and r7,r7,r3 - str r7,[$ctx,#12] -#if __ARM_MAX_ARCH__>=7 - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 -#else - mov r0,#0 -#endif -.Lno_key: - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size GFp_poly1305_init_asm,.-GFp_poly1305_init_asm -___ -{ -my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); -my ($s1,$s2,$s3)=($r1,$r2,$r3); - -$code.=<<___; -.type GFp_poly1305_blocks,%function -.align 5 -GFp_poly1305_blocks: - stmdb 
sp!,{r3-r11,lr} - - ands $len,$len,#-16 - beq .Lno_data - - cmp $padbit,#0 - add $len,$len,$inp @ end pointer - sub sp,sp,#32 - - ldmia $ctx,{$h0-$r3} @ load context - - str $ctx,[sp,#12] @ offload stuff - mov lr,$inp - str $len,[sp,#16] - str $r1,[sp,#20] - str $r2,[sp,#24] - str $r3,[sp,#28] - b .Loop - -.Loop: -#if __ARM_ARCH__<7 - ldrb r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ 1<<128 - ldrb r1,[lr,#-15] - ldrb r2,[lr,#-14] - ldrb r3,[lr,#-13] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-12] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-11] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-10] - adds $h0,$h0,r3 @ accumulate input - - ldrb r3,[lr,#-9] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-8] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-7] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-6] - adcs $h1,$h1,r3 - - ldrb r3,[lr,#-5] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-4] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-3] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-2] - adcs $h2,$h2,r3 - - ldrb r3,[lr,#-1] - orr r1,r0,r1,lsl#8 - str lr,[sp,#8] @ offload input pointer - orr r2,r1,r2,lsl#16 - add $s1,$r1,$r1,lsr#2 - orr r3,r2,r3,lsl#24 -#else - ldr r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ padbit - ldr r1,[lr,#-12] - ldr r2,[lr,#-8] - ldr r3,[lr,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - adds $h0,$h0,r0 @ accumulate input - str lr,[sp,#8] @ offload input pointer - adcs $h1,$h1,r1 - add $s1,$r1,$r1,lsr#2 - adcs $h2,$h2,r2 -#endif - add $s2,$r2,$r2,lsr#2 - adcs $h3,$h3,r3 - add $s3,$r3,$r3,lsr#2 - - umull r2,r3,$h1,$r0 - adc $h4,$h4,#0 - umull r0,r1,$h0,$r0 - umlal r2,r3,$h4,$s1 - umlal r0,r1,$h3,$s1 - ldr $r1,[sp,#20] @ reload $r1 - umlal r2,r3,$h2,$s3 - umlal r0,r1,$h1,$s3 - umlal r2,r3,$h3,$s2 - umlal r0,r1,$h2,$s2 - umlal r2,r3,$h0,$r1 - str r0,[sp,#0] @ future $h0 - mul r0,$s2,$h4 - ldr $r2,[sp,#24] @ reload $r2 - adds r2,r2,r1 @ d1+=d0>>32 - eor r1,r1,r1 - adc lr,r3,#0 @ future $h2 - str r2,[sp,#4] @ future $h1 - - mul r2,$s3,$h4 - eor r3,r3,r3 - umlal r0,r1,$h3,$s3 - ldr $r3,[sp,#28] @ reload $r3 - umlal r2,r3,$h3,$r0 - umlal r0,r1,$h2,$r0 - umlal r2,r3,$h2,$r1 - umlal r0,r1,$h1,$r1 - umlal r2,r3,$h1,$r2 - umlal r0,r1,$h0,$r2 - umlal r2,r3,$h0,$r3 - ldr $h0,[sp,#0] - mul $h4,$r0,$h4 - ldr $h1,[sp,#4] - - adds $h2,lr,r0 @ d2+=d1>>32 - ldr lr,[sp,#8] @ reload input pointer - adc r1,r1,#0 - adds $h3,r2,r1 @ d3+=d2>>32 - ldr r0,[sp,#16] @ reload end pointer - adc r3,r3,#0 - add $h4,$h4,r3 @ h4+=d3>>32 - - and r1,$h4,#-4 - and $h4,$h4,#3 - add r1,r1,r1,lsr#2 @ *=5 - adds $h0,$h0,r1 - adcs $h1,$h1,#0 - adcs $h2,$h2,#0 - adcs $h3,$h3,#0 - adc $h4,$h4,#0 - - cmp r0,lr @ done yet? - bhi .Loop - - ldr $ctx,[sp,#12] - add sp,sp,#32 - stmia $ctx,{$h0-$h4} @ store the result - -.Lno_data: -#if __ARM_ARCH__>=5 - ldmia sp!,{r3-r11,pc} -#else - ldmia sp!,{r3-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size GFp_poly1305_blocks,.-GFp_poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce)=map("r$_",(0..2)); -my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); -my $g4=$h4; - -$code.=<<___; -.type GFp_poly1305_emit,%function -.align 5 -GFp_poly1305_emit: - stmdb sp!,{r4-r11} -.Lpoly1305_emit_enter: - - ldmia $ctx,{$h0-$h4} - adds $g0,$h0,#5 @ compare to modulus - adcs $g1,$h1,#0 - adcs $g2,$h2,#0 - adcs $g3,$h3,#0 - adc $g4,$h4,#0 - tst $g4,#4 @ did it carry/borrow? 
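The tst here checks whether h + 5 carried out of bit 130, i.e. whether h >= p = 2^130 - 5; the conditional moves just below then select the reduced value, and the nonce is folded in modulo 2^128. The same computation as a portable-C sketch, with illustrative names and a little-endian store assumed; this is exposition, not code from the patch:

    #include <stdint.h>
    #include <string.h>

    /* h[0..4]: accumulator limbs, base 2^32, already carried so h[4]
       holds only bits 128..130. nonce[0..3]: the encrypted nonce words. */
    static void poly1305_emit_sketch(const uint32_t h[5],
                                     const uint32_t nonce[4],
                                     uint8_t mac[16]) {
      uint32_t g[5], mask;
      uint64_t c = 5;

      for (int i = 0; i < 5; i++) {       /* g = h + 5 */
        c += h[i];
        g[i] = (uint32_t)c;
        c >>= 32;
      }
      /* bit 2 of g[4] is bit 130 of h + 5: set iff h >= p = 2^130 - 5 */
      mask = 0 - ((g[4] >> 2) & 1);
      c = 0;
      for (int i = 0; i < 4; i++) {       /* select, add nonce mod 2^128 */
        c += (uint64_t)((g[i] & mask) | (h[i] & ~mask)) + nonce[i];
        uint32_t w = (uint32_t)c;
        memcpy(mac + 4 * i, &w, 4);       /* little-endian output words */
        c >>= 32;
      }
    }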
- -#ifdef __thumb2__ - it ne -#endif - movne $h0,$g0 - ldr $g0,[$nonce,#0] -#ifdef __thumb2__ - it ne -#endif - movne $h1,$g1 - ldr $g1,[$nonce,#4] -#ifdef __thumb2__ - it ne -#endif - movne $h2,$g2 - ldr $g2,[$nonce,#8] -#ifdef __thumb2__ - it ne -#endif - movne $h3,$g3 - ldr $g3,[$nonce,#12] - - adds $h0,$h0,$g0 - adcs $h1,$h1,$g1 - adcs $h2,$h2,$g2 - adc $h3,$h3,$g3 - -#if __ARM_ARCH__>=7 -# ifdef __ARMEB__ - rev $h0,$h0 - rev $h1,$h1 - rev $h2,$h2 - rev $h3,$h3 -# endif - str $h0,[$mac,#0] - str $h1,[$mac,#4] - str $h2,[$mac,#8] - str $h3,[$mac,#12] -#else - strb $h0,[$mac,#0] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#4] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#8] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#12] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#1] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#5] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#9] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#13] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#2] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#6] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#10] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#14] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#3] - strb $h1,[$mac,#7] - strb $h2,[$mac,#11] - strb $h3,[$mac,#15] -#endif - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size GFp_poly1305_emit,.-GFp_poly1305_emit -___ -{ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); -my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); -my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); - -my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.fpu neon - -.type poly1305_init_neon,%function -.align 5 -poly1305_init_neon: - ldr r4,[$ctx,#20] @ load key base 2^32 - ldr r5,[$ctx,#24] - ldr r6,[$ctx,#28] - ldr r7,[$ctx,#32] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - and r3,r3,#0x03ffffff - and r4,r4,#0x03ffffff - and r5,r5,#0x03ffffff - - vdup.32 $R0,r2 @ r^1 in both lanes - add r2,r3,r3,lsl#2 @ *5 - vdup.32 $R1,r3 - add r3,r4,r4,lsl#2 - vdup.32 $S1,r2 - vdup.32 $R2,r4 - add r4,r5,r5,lsl#2 - vdup.32 $S2,r3 - vdup.32 $R3,r5 - add r5,r6,r6,lsl#2 - vdup.32 $S3,r4 - vdup.32 $R4,r6 - vdup.32 $S4,r5 - - mov $zeros,#2 @ counter - -.Lsquare_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - - vmull.u32 $D0,$R0,${R0}[1] - vmull.u32 $D1,$R1,${R0}[1] - vmull.u32 $D2,$R2,${R0}[1] - vmull.u32 $D3,$R3,${R0}[1] - vmull.u32 $D4,$R4,${R0}[1] - - vmlal.u32 $D0,$R4,${S1}[1] - vmlal.u32 $D1,$R0,${R1}[1] - vmlal.u32 $D2,$R1,${R1}[1] - vmlal.u32 $D3,$R2,${R1}[1] - vmlal.u32 $D4,$R3,${R1}[1] - - vmlal.u32 $D0,$R3,${S2}[1] - vmlal.u32 $D1,$R4,${S2}[1] - vmlal.u32 $D3,$R1,${R2}[1] - vmlal.u32 $D2,$R0,${R2}[1] - vmlal.u32 $D4,$R2,${R2}[1] - - vmlal.u32 $D0,$R2,${S3}[1] - vmlal.u32 $D3,$R0,${R3}[1] - vmlal.u32 $D1,$R3,${S3}[1] - vmlal.u32 $D2,$R4,${S3}[1] - vmlal.u32 $D4,$R1,${R3}[1] - - vmlal.u32 $D3,$R4,${S4}[1] - vmlal.u32 $D0,$R1,${S4}[1] - vmlal.u32 $D1,$R2,${S4}[1] - vmlal.u32 $D2,$R3,${S4}[1] - vmlal.u32 $D4,$R0,${R4}[1] - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy 
reduction as discussed in "NEON crypto" by D.J. Bernstein - @ and P. Schwabe - @ - @ H0>>+H1>>+H2>>+H3>>+H4 - @ H3>>+H4>>*5+H0>>+H1 - @ - @ Trivia. - @ - @ Result of multiplication of n-bit number by m-bit number is - @ n+m bits wide. However! Even though 2^n is a n+1-bit number, - @ m-bit number multiplied by 2^n is still n+m bits wide. - @ - @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, - @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit - @ one is n+1 bits wide. - @ - @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that - @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 - @ can be 27. However! In cases when their width exceeds 26 bits - @ they are limited by 2^26+2^6. This in turn means that *sum* - @ of the products with these values can still be viewed as sum - @ of 52-bit numbers as long as the amount of addends is not a - @ power of 2. For example, - @ - @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, - @ - @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or - @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than - @ 8 * (2^52) or 2^55. However, the value is then multiplied by - @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), - @ which is less than 32 * (2^52) or 2^57. And when processing - @ data we are looking at triple as many addends... - @ - @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and - @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the - @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while - @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 - @ instruction accepts 2x32-bit input and writes 2x64-bit result. - @ This means that result of reduction have to be compressed upon - @ loop wrap-around. This can be done in the process of reduction - @ to minimize amount of instructions [as well as amount of - @ 128-bit instructions, which benefits low-end processors], but - @ one has to watch for H2 (which is narrower than H0) and 5*H4 - @ not being wider than 58 bits, so that result of right shift - @ by 26 bits fits in 32 bits. This is also useful on x86, - @ because it allows to use paddd in place for paddq, which - @ benefits Atom, where paddq is ridiculously slow. 
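The width analysis above is easier to follow in scalar form. Here is a minimal C model of one such lazy-reduction pass over five 26-bit limbs, with the carry out of the top limb folded back into the bottom limb times 5 (2^130 is congruent to 5 mod 2^130 - 5; the vshl/vadd pairs just below compute c*5 as c + (c << 2) the same way). Illustrative name and one possible carry ordering; the NEON code runs two such chains side by side and tolerates the 27-bit limbs discussed above:

    #include <stdint.h>

    static void lazy_reduce(uint64_t d[5]) {  /* d[i] <= ~2^58 on entry */
      uint64_t c;
      c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;            /* h3 -> h4 */
      c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;            /* h0 -> h1 */
      c = d[4] >> 26; d[4] &= 0x3ffffff; d[0] += c + (c << 2); /* h4 -> h0, *5 */
      c = d[1] >> 26; d[1] &= 0x3ffffff; d[2] += c;            /* h1 -> h2 */
      c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;            /* h0 -> h1 */
      c = d[2] >> 26; d[2] &= 0x3ffffff; d[3] += c;            /* h2 -> h3 */
      c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;            /* h3 -> h4 */
      /* now d0, d2, d3 are 26 bits; d1 and d4 may be 27, as noted above */
    }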
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vbic.i32 $D4#lo,#0xfc000000 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vbic.i32 $D2#lo,#0xfc000000 - - vshr.u32 $T0#lo,$D0#lo,#26 - vbic.i32 $D0#lo,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - - subs $zeros,$zeros,#1 - beq .Lsquare_break_neon - - add $tbl0,$ctx,#(48+0*9*4) - add $tbl1,$ctx,#(48+1*9*4) - - vtrn.32 $R0,$D0#lo @ r^2:r^1 - vtrn.32 $R2,$D2#lo - vtrn.32 $R3,$D3#lo - vtrn.32 $R1,$D1#lo - vtrn.32 $R4,$D4#lo - - vshl.u32 $S2,$R2,#2 @ *5 - vshl.u32 $S3,$R3,#2 - vshl.u32 $S1,$R1,#2 - vshl.u32 $S4,$R4,#2 - vadd.i32 $S2,$S2,$R2 - vadd.i32 $S1,$S1,$R1 - vadd.i32 $S3,$S3,$R3 - vadd.i32 $S4,$S4,$R4 - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0,:32] - vst1.32 {${S4}[1]},[$tbl1,:32] - - b .Lsquare_neon - -.align 4 -.Lsquare_break_neon: - add $tbl0,$ctx,#(48+2*4*9) - add $tbl1,$ctx,#(48+3*4*9) - - vmov $R0,$D0#lo @ r^4:r^3 - vshl.u32 $S1,$D1#lo,#2 @ *5 - vmov $R1,$D1#lo - vshl.u32 $S2,$D2#lo,#2 - vmov $R2,$D2#lo - vshl.u32 $S3,$D3#lo,#2 - vmov $R3,$D3#lo - vshl.u32 $S4,$D4#lo,#2 - vmov $R4,$D4#lo - vadd.i32 $S1,$S1,$D1#lo - vadd.i32 $S2,$S2,$D2#lo - vadd.i32 $S3,$S3,$D3#lo - vadd.i32 $S4,$S4,$D4#lo - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0] - vst1.32 {${S4}[1]},[$tbl1] - - ret @ bx lr -.size poly1305_init_neon,.-poly1305_init_neon - -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: - ldr ip,[$ctx,#36] @ is_base2_26 - ands $len,$len,#-16 - beq .Lno_data_neon - - cmp $len,#64 - bhs .Lenter_neon - tst ip,ip @ is_base2_26? - beq GFp_poly1305_blocks - -.Lenter_neon: - stmdb sp!,{r4-r7} - vstmdb sp!,{d8-d15} @ ABI specification says so - - tst ip,ip @ is_base2_26? 
- bne .Lbase2_26_neon - - stmdb sp!,{r1-r3,lr} - bl poly1305_init_neon - - ldr r4,[$ctx,#0] @ load hash value base 2^32 - ldr r5,[$ctx,#4] - ldr r6,[$ctx,#8] - ldr r7,[$ctx,#12] - ldr ip,[$ctx,#16] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - veor $D0#lo,$D0#lo,$D0#lo - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - veor $D1#lo,$D1#lo,$D1#lo - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - veor $D2#lo,$D2#lo,$D2#lo - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - veor $D3#lo,$D3#lo,$D3#lo - and r3,r3,#0x03ffffff - orr r6,r6,ip,lsl#24 - veor $D4#lo,$D4#lo,$D4#lo - and r4,r4,#0x03ffffff - mov r1,#1 - and r5,r5,#0x03ffffff - str r1,[$ctx,#36] @ is_base2_26 - - vmov.32 $D0#lo[0],r2 - vmov.32 $D1#lo[0],r3 - vmov.32 $D2#lo[0],r4 - vmov.32 $D3#lo[0],r5 - vmov.32 $D4#lo[0],r6 - adr $zeros,.Lzeros - - ldmia sp!,{r1-r3,lr} - b .Lbase2_32_neon - -.align 4 -.Lbase2_26_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ load hash value - - veor $D0#lo,$D0#lo,$D0#lo - veor $D1#lo,$D1#lo,$D1#lo - veor $D2#lo,$D2#lo,$D2#lo - veor $D3#lo,$D3#lo,$D3#lo - veor $D4#lo,$D4#lo,$D4#lo - vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - adr $zeros,.Lzeros - vld1.32 {$D4#lo[0]},[$ctx] - sub $ctx,$ctx,#16 @ rewind - -.Lbase2_32_neon: - add $in2,$inp,#32 - mov $padbit,$padbit,lsl#24 - tst $len,#31 - beq .Leven - - vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! - vmov.32 $H4#lo[0],$padbit - sub $len,$len,#16 - add $in2,$inp,#32 - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3#lo,$H3#lo,#18 - - vsri.u32 $H3#lo,$H2#lo,#14 - vshl.u32 $H2#lo,$H2#lo,#12 - vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi - - vbic.i32 $H3#lo,#0xfc000000 - vsri.u32 $H2#lo,$H1#lo,#20 - vshl.u32 $H1#lo,$H1#lo,#6 - - vbic.i32 $H2#lo,#0xfc000000 - vsri.u32 $H1#lo,$H0#lo,#26 - vadd.i32 $H3#hi,$H3#lo,$D3#lo - - vbic.i32 $H0#lo,#0xfc000000 - vbic.i32 $H1#lo,#0xfc000000 - vadd.i32 $H2#hi,$H2#lo,$D2#lo - - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - - mov $tbl1,$zeros - add $tbl0,$ctx,#48 - - cmp $len,$len - b .Long_tail - -.align 4 -.Leven: - subs $len,$len,#64 - it lo - movlo $in2,$zeros - - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - itt hi - addhi $tbl1,$ctx,#(48+1*9*4) - addhi $tbl0,$ctx,#(48+3*9*4) - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3,$H3,#18 - - vsri.u32 $H3,$H2,#14 - vshl.u32 $H2,$H2,#12 - - vbic.i32 $H3,#0xfc000000 - vsri.u32 $H2,$H1,#20 - vshl.u32 $H1,$H1,#6 - - vbic.i32 $H2,#0xfc000000 - vsri.u32 $H1,$H0,#26 - - vbic.i32 $H0,#0xfc000000 - vbic.i32 $H1,#0xfc000000 - - bls .Lskip_loop - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
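The prologues above reslice four 32-bit words (five once the hash's high bits join in) into five 26-bit limbs with vsri/vshl pairs. The same reslicing in plain C, with illustrative names; a sketch, not code from the patch:

    #include <stdint.h>

    /* w[0..3]: value base 2^32; w[4]: bits 128 and up (zero for the key). */
    static void to_base26(const uint32_t w[5], uint32_t h[5]) {
      h[0] =   w[0]                        & 0x3ffffff;
      h[1] = ((w[0] >> 26) | (w[1] <<  6)) & 0x3ffffff;
      h[2] = ((w[1] >> 20) | (w[2] << 12)) & 0x3ffffff;
      h[3] = ((w[2] >> 14) | (w[3] << 18)) & 0x3ffffff;
      h[4] =  (w[3] >>  8) | (w[4] << 24);   /* keeps bits 104..130 */
    }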
- b .Loop_neon - -.align 5 -.Loop_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - @ \___________________/ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - @ \___________________/ \____________________/ - @ - @ Note that we start with inp[2:3]*r^2. This is because it - @ doesn't depend on reduction in previous iteration. - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ inp[2:3]*r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] - vmull.u32 $D2,$H2#hi,${R0}[1] - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,${R0}[1] - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,${R0}[1] - vmlal.u32 $D2,$H1#hi,${R1}[1] - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,${R0}[1] - - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,${R0}[1] - subs $len,$len,#64 - vmlal.u32 $D0,$H4#hi,${S1}[1] - it lo - movlo $in2,$zeros - vmlal.u32 $D3,$H2#hi,${R1}[1] - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D1,$H0#hi,${R1}[1] - vmlal.u32 $D4,$H3#hi,${R1}[1] - - vmlal.u32 $D0,$H3#hi,${S2}[1] - vmlal.u32 $D3,$H1#hi,${R2}[1] - vmlal.u32 $D4,$H2#hi,${R2}[1] - vmlal.u32 $D1,$H4#hi,${S2}[1] - vmlal.u32 $D2,$H0#hi,${R2}[1] - - vmlal.u32 $D3,$H0#hi,${R3}[1] - vmlal.u32 $D0,$H2#hi,${S3}[1] - vmlal.u32 $D4,$H1#hi,${R3}[1] - vmlal.u32 $D1,$H3#hi,${S3}[1] - vmlal.u32 $D2,$H4#hi,${S3}[1] - - vmlal.u32 $D3,$H4#hi,${S4}[1] - vmlal.u32 $D0,$H1#hi,${S4}[1] - vmlal.u32 $D4,$H0#hi,${R4}[1] - vmlal.u32 $D1,$H2#hi,${S4}[1] - vmlal.u32 $D2,$H3#hi,${S4}[1] - - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4 and accumulate - - vmlal.u32 $D3,$H3#lo,${R0}[0] - vmlal.u32 $D0,$H0#lo,${R0}[0] - vmlal.u32 $D4,$H4#lo,${R0}[0] - vmlal.u32 $D1,$H1#lo,${R0}[0] - vmlal.u32 $D2,$H2#lo,${R0}[0] - vld1.32 ${S4}[0],[$tbl0,:32] - - vmlal.u32 $D3,$H2#lo,${R1}[0] - vmlal.u32 $D0,$H4#lo,${S1}[0] - vmlal.u32 $D4,$H3#lo,${R1}[0] - vmlal.u32 $D1,$H0#lo,${R1}[0] - vmlal.u32 $D2,$H1#lo,${R1}[0] - - vmlal.u32 $D3,$H1#lo,${R2}[0] - vmlal.u32 $D0,$H3#lo,${S2}[0] - vmlal.u32 $D4,$H2#lo,${R2}[0] - vmlal.u32 $D1,$H4#lo,${S2}[0] - vmlal.u32 $D2,$H0#lo,${R2}[0] - - vmlal.u32 $D3,$H0#lo,${R3}[0] - vmlal.u32 $D0,$H2#lo,${S3}[0] - vmlal.u32 $D4,$H1#lo,${R3}[0] - vmlal.u32 $D1,$H3#lo,${S3}[0] - vmlal.u32 $D3,$H4#lo,${S4}[0] - - vmlal.u32 $D2,$H4#lo,${S3}[0] - vmlal.u32 $D0,$H1#lo,${S4}[0] - vmlal.u32 $D4,$H0#lo,${R4}[0] - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vmlal.u32 $D1,$H2#lo,${S4}[0] - vmlal.u32 $D2,$H3#lo,${S4}[0] - - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 - vrev32.8 $H3,$H3 -# endif - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction interleaved with base 2^32 -> base 2^26 of - @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
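The comment opening .Loop_neon above splits the message into even and odd blocks, runs two independent Horner chains in the two vector lanes, and recombines them at the end with weights r^2:r (or r^4:r^3 for a longer tail). The underlying identity is easy to sanity-check in plain C with a small prime standing in for 2^130 - 5; this toy is purely illustrative:

    #include <assert.h>
    #include <stdint.h>

    #define P 1000003ULL  /* toy modulus */

    int main(void) {
      uint64_t r = 12345, r2 = (r * r) % P, h = 7;
      uint64_t m[4] = {11, 22, 33, 44};

      /* serial Horner: ((((h+m0)*r + m1)*r + m2)*r + m3)*r */
      uint64_t serial = h;
      for (int i = 0; i < 4; i++) serial = ((serial + m[i]) * r) % P;

      /* two lanes over even/odd blocks, then weights r^2:r */
      uint64_t even = ((h + m[0]) * r2 + m[2]) % P;
      uint64_t odd  = (m[1] * r2 + m[3]) % P;
      assert(serial == (even * r2 + odd * r) % P);
      return 0;
    }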
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vshl.u32 $H3,$H3,#18 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vsri.u32 $H3,$H2,#14 - vbic.i32 $D4#lo,#0xfc000000 - vshl.u32 $H2,$H2,#12 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vbic.i32 $H3,#0xfc000000 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] - vsri.u32 $H2,$H1,#20 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vshl.u32 $H1,$H1,#6 - vbic.i32 $D2#lo,#0xfc000000 - vbic.i32 $H2,#0xfc000000 - - vshrn.u64 $T0#lo,$D0,#26 @ re-narrow - vmovn.i64 $D0#lo,$D0 - vsri.u32 $H1,$H0,#26 - vbic.i32 $H0,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vbic.i32 $D0#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - vbic.i32 $H1,#0xfc000000 - - bhi .Loop_neon - -.Lskip_loop: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - add $tbl1,$ctx,#(48+0*9*4) - add $tbl0,$ctx,#(48+1*9*4) - adds $len,$len,#32 - it ne - movne $len,#0 - bne .Long_tail - - vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H3#hi,$H3#lo,$D3#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - vadd.i32 $H4#hi,$H4#lo,$D4#lo - -.Long_tail: - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant - vmull.u32 $D2,$H2#hi,$R0 - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,$R0 - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,$R0 - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,$R0 - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,$R0 - - vmlal.u32 $D0,$H4#hi,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vmlal.u32 $D3,$H2#hi,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#hi,$R1 - vmlal.u32 $D4,$H3#hi,$R1 - vmlal.u32 $D2,$H1#hi,$R1 - - vmlal.u32 $D3,$H1#hi,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#hi,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#hi,$R2 - vmlal.u32 $D1,$H4#hi,$S2 - vmlal.u32 $D2,$H0#hi,$R2 - - vmlal.u32 $D3,$H0#hi,$R3 - it ne - addne $tbl1,$ctx,#(48+2*9*4) - vmlal.u32 $D0,$H2#hi,$S3 - it ne - addne $tbl0,$ctx,#(48+3*9*4) - vmlal.u32 $D4,$H1#hi,$R3 - vmlal.u32 $D1,$H3#hi,$S3 - vmlal.u32 $D2,$H4#hi,$S3 - - vmlal.u32 $D3,$H4#hi,$S4 - vorn $MASK,$MASK,$MASK @ all-ones, can be redundant - vmlal.u32 $D0,$H1#hi,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#hi,$R4 - vmlal.u32 $D1,$H2#hi,$S4 - vmlal.u32 $D2,$H3#hi,$S4 - - beq .Lshort_tail - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4:r^3 and accumulate - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - - vmlal.u32 $D2,$H2#lo,$R0 - vmlal.u32 $D0,$H0#lo,$R0 - vmlal.u32 $D3,$H3#lo,$R0 - vmlal.u32 $D1,$H1#lo,$R0 - vmlal.u32 $D4,$H4#lo,$R0 - - vmlal.u32 $D0,$H4#lo,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
- vmlal.u32 $D3,$H2#lo,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#lo,$R1 - vmlal.u32 $D4,$H3#lo,$R1 - vmlal.u32 $D2,$H1#lo,$R1 - - vmlal.u32 $D3,$H1#lo,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#lo,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#lo,$R2 - vmlal.u32 $D1,$H4#lo,$S2 - vmlal.u32 $D2,$H0#lo,$R2 - - vmlal.u32 $D3,$H0#lo,$R3 - vmlal.u32 $D0,$H2#lo,$S3 - vmlal.u32 $D4,$H1#lo,$R3 - vmlal.u32 $D1,$H3#lo,$S3 - vmlal.u32 $D2,$H4#lo,$S3 - - vmlal.u32 $D3,$H4#lo,$S4 - vorn $MASK,$MASK,$MASK @ all-ones - vmlal.u32 $D0,$H1#lo,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#lo,$R4 - vmlal.u32 $D1,$H2#lo,$S4 - vmlal.u32 $D2,$H3#lo,$S4 - -.Lshort_tail: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ horizontal addition - - vadd.i64 $D3#lo,$D3#lo,$D3#hi - vadd.i64 $D0#lo,$D0#lo,$D0#hi - vadd.i64 $D4#lo,$D4#lo,$D4#hi - vadd.i64 $D1#lo,$D1#lo,$D1#hi - vadd.i64 $D2#lo,$D2#lo,$D2#hi - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction, but without narrowing - - vshr.u64 $T0,$D3,#26 - vand.i64 $D3,$D3,$MASK - vshr.u64 $T1,$D0,#26 - vand.i64 $D0,$D0,$MASK - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - - vshr.u64 $T0,$D4,#26 - vand.i64 $D4,$D4,$MASK - vshr.u64 $T1,$D1,#26 - vand.i64 $D1,$D1,$MASK - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - - vadd.i64 $D0,$D0,$T0 - vshl.u64 $T0,$T0,#2 - vshr.u64 $T1,$D2,#26 - vand.i64 $D2,$D2,$MASK - vadd.i64 $D0,$D0,$T0 @ h4 -> h0 - vadd.i64 $D3,$D3,$T1 @ h2 -> h3 - - vshr.u64 $T0,$D0,#26 - vand.i64 $D0,$D0,$MASK - vshr.u64 $T1,$D3,#26 - vand.i64 $D3,$D3,$MASK - vadd.i64 $D1,$D1,$T0 @ h0 -> h1 - vadd.i64 $D4,$D4,$T1 @ h3 -> h4 - - cmp $len,#0 - bne .Leven - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ store hash value - - vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - vst1.32 {$D4#lo[0]},[$ctx] - - vldmia sp!,{d8-d15} @ epilogue - ldmia sp!,{r4-r7} -.Lno_data_neon: - ret @ bx lr -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.type poly1305_emit_neon,%function -.align 5 -poly1305_emit_neon: - ldr ip,[$ctx,#36] @ is_base2_26 - - stmdb sp!,{r4-r11} - - tst ip,ip - beq .Lpoly1305_emit_enter - - ldmia $ctx,{$h0-$h4} - eor $g0,$g0,$g0 - - adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 - mov $h1,$h1,lsr#6 - adcs $h1,$h1,$h2,lsl#20 - mov $h2,$h2,lsr#12 - adcs $h2,$h2,$h3,lsl#14 - mov $h3,$h3,lsr#18 - adcs $h3,$h3,$h4,lsl#8 - adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ... - - and $g0,$h4,#-4 @ ... so reduce - and $h4,$h3,#3 - add $g0,$g0,$g0,lsr#2 @ *= 5 - adds $h0,$h0,$g0 - adcs $h1,$h1,#0 - adcs $h2,$h2,#0 - adcs $h3,$h3,#0 - adc $h4,$h4,#0 - - adds $g0,$h0,#5 @ compare to modulus - adcs $g1,$h1,#0 - adcs $g2,$h2,#0 - adcs $g3,$h3,#0 - adc $g4,$h4,#0 - tst $g4,#4 @ did it carry/borrow? 
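poly1305_emit_neon first repacks the base 2^26 limbs back into base 2^32 (the lsl/lsr pairs above), folds the excess above bit 130 back in times 5, and only then runs the same compare-to-modulus selection as the scalar path. The repacking step in sketch form, with illustrative names; the fold and the selection are as modeled earlier:

    #include <stdint.h>

    /* h[0..4]: base 2^26 limbs, possibly only partially reduced. */
    static void from_base26(const uint32_t h[5], uint32_t w[5]) {
      uint64_t c;
      c = (uint64_t)h[0] + ((uint64_t)h[1] << 26);
      w[0] = (uint32_t)c;
      c = (c >> 32) + (h[1] >> 6)  + ((uint64_t)h[2] << 20);
      w[1] = (uint32_t)c;
      c = (c >> 32) + (h[2] >> 12) + ((uint64_t)h[3] << 14);
      w[2] = (uint32_t)c;
      c = (c >> 32) + (h[3] >> 18) + ((uint64_t)h[4] <<  8);
      w[3] = (uint32_t)c;
      w[4] = (uint32_t)(c >> 32) + (h[4] >> 24);  /* bits 128+, reduce next */
    }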
- - it ne - movne $h0,$g0 - ldr $g0,[$nonce,#0] - it ne - movne $h1,$g1 - ldr $g1,[$nonce,#4] - it ne - movne $h2,$g2 - ldr $g2,[$nonce,#8] - it ne - movne $h3,$g3 - ldr $g3,[$nonce,#12] - - adds $h0,$h0,$g0 @ accumulate nonce - adcs $h1,$h1,$g1 - adcs $h2,$h2,$g2 - adc $h3,$h3,$g3 - -# ifdef __ARMEB__ - rev $h0,$h0 - rev $h1,$h1 - rev $h2,$h2 - rev $h3,$h3 -# endif - str $h0,[$mac,#0] @ store the result - str $h1,[$mac,#4] - str $h2,[$mac,#8] - str $h3,[$mac,#12] - - ldmia sp!,{r4-r11} - ret @ bx lr -.size poly1305_emit_neon,.-poly1305_emit_neon - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.LOPENSSL_armcap: -.word GFp_armcap_P-.Lpoly1305_init -#endif -___ -} } -$code.=<<___; -.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by " -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm GFp_armcap_P,4,4 -#endif -___ - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 - - print $_,"\n"; -} -close STDOUT or die "error closing STDOUT"; diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl deleted file mode 100755 index 82aee20c11..0000000000 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ /dev/null @@ -1,931 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for ARMv8. -# -# June 2015 -# -# Numbers are cycles per processed byte with GFp_poly1305_blocks alone. -# -# IALU/gcc-4.9 NEON -# -# Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.69/+58% 1.47 -# Cortex-A57 2.70/+7% 1.14 -# Denver 1.64/+50% 1.18(*) -# X-Gene 2.13/+68% 2.27 -# -# (*) estimate based on resources availability is less than 1.0, -# i.e. 
measured result is worse than expected, presumably binary -# translator is not almighty; - -$flavour=shift; -$output=shift; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or -die "can't locate arm-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); -my ($mac,$nonce)=($inp,$len); - -my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); - -$code.=<<___; -#include - -.text - -// forward "declarations" are required for Apple -.extern GFp_armcap_P -.globl GFp_poly1305_blocks -.globl GFp_poly1305_emit -.globl GFp_poly1305_init_asm - -.type GFp_poly1305_init_asm,%function -.align 5 -GFp_poly1305_init_asm: - cmp $inp,xzr - stp xzr,xzr,[$ctx] // zero hash value - stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] - - csel x0,xzr,x0,eq - b.eq .Lno_key - -#ifdef __ILP32__ - ldrsw $t1,.LGFp_armcap_P -#else - ldr $t1,.LGFp_armcap_P -#endif - adr $t0,.LGFp_armcap_P - - ldp $r0,$r1,[$inp] // load key - mov $s1,#0xfffffffc0fffffff - movk $s1,#0x0fff,lsl#48 - ldr w17,[$t0,$t1] -#ifdef __ARMEB__ - rev $r0,$r0 // flip bytes - rev $r1,$r1 -#endif - and $r0,$r0,$s1 // &=0ffffffc0fffffff - and $s1,$s1,#-4 - and $r1,$r1,$s1 // &=0ffffffc0ffffffc - stp $r0,$r1,[$ctx,#32] // save key value - - tst w17,#ARMV7_NEON - - adr $d0,GFp_poly1305_blocks - adr $r0,poly1305_blocks_neon - adr $d1,GFp_poly1305_emit - adr $r1,poly1305_emit_neon - - csel $d0,$d0,$r0,eq - csel $d1,$d1,$r1,eq - - stp $d0,$d1,[$len] - - mov x0,#1 -.Lno_key: - ret -.size GFp_poly1305_init_asm,.-GFp_poly1305_init_asm - -.type GFp_poly1305_blocks,%function -.align 5 -GFp_poly1305_blocks: - ands $len,$len,#-16 - b.eq .Lno_data - - ldp $h0,$h1,[$ctx] // load hash value - ldp $r0,$r1,[$ctx,#32] // load key value - ldr $h2,[$ctx,#16] - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - b .Loop - -.align 5 -.Loop: - ldp $t0,$t1,[$inp],#16 // load input - sub $len,$len,#16 -#ifdef __ARMEB__ - rev $t0,$t0 - rev $t1,$t1 -#endif - adds $h0,$h0,$t0 // accumulate input - adcs $h1,$h1,$t1 - - mul $d0,$h0,$r0 // h0*r0 - adc $h2,$h2,$padbit - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - cbnz $len,.Loop - - stp $h0,$h1,[$ctx] // store hash value - str $h2,[$ctx,#16] - -.Lno_data: - ret -.size GFp_poly1305_blocks,.-GFp_poly1305_blocks - -.type GFp_poly1305_emit,%function -.align 5 -GFp_poly1305_emit: - ldp $h0,$h1,[$ctx] // load hash base 2^64 - ldr $h2,[$ctx,#16] - ldp $t0,$t1,[$nonce] // load nonce - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __ARMEB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __ARMEB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size GFp_poly1305_emit,.-GFp_poly1305_emit -___ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) 
= map("v$_.4s",(0..8)); -my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); -my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); -my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); -my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); -my ($T0,$T1,$MASK) = map("v$_",(29..31)); - -my ($in2,$zeros)=("x16","x17"); -my $is_base2_26 = $zeros; # borrow - -$code.=<<___; -.type poly1305_mult,%function -.align 5 -poly1305_mult: - mul $d0,$h0,$r0 // h0*r0 - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - ret -.size poly1305_mult,.-poly1305_mult - -.type poly1305_splat,%function -.align 5 -poly1305_splat: - and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x13,$h0,#26,#26 - extr x14,$h1,$h0,#52 - and x14,x14,#0x03ffffff - ubfx x15,$h1,#14,#26 - extr x16,$h2,$h1,#40 - - str w12,[$ctx,#16*0] // r0 - add w12,w13,w13,lsl#2 // r1*5 - str w13,[$ctx,#16*1] // r1 - add w13,w14,w14,lsl#2 // r2*5 - str w12,[$ctx,#16*2] // s1 - str w14,[$ctx,#16*3] // r2 - add w14,w15,w15,lsl#2 // r3*5 - str w13,[$ctx,#16*4] // s2 - str w15,[$ctx,#16*5] // r3 - add w15,w16,w16,lsl#2 // r4*5 - str w14,[$ctx,#16*6] // s3 - str w16,[$ctx,#16*7] // r4 - str w15,[$ctx,#16*8] // s4 - - ret -.size poly1305_splat,.-poly1305_splat - -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: - ldr $is_base2_26,[$ctx,#24] - cmp $len,#128 - b.hs .Lblocks_neon - cbz $is_base2_26,GFp_poly1305_blocks - -.Lblocks_neon: - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - - ands $len,$len,#-16 - b.eq .Lno_data_neon - - cbz $is_base2_26,.Lbase2_64_neon - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - tst $len,#31 - b.eq .Leven_neon - - ldp $r0,$r1,[$ctx,#32] // load key value - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $d2,$h2,xzr // can be partially reduced... - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - - and $t0,$d2,#-4 // ... 
so reduce - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$h0,$t0 - adcs $h1,$h1,xzr - adc $h2,$h2,xzr - -#ifdef __ARMEB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - ldr x30,[sp,#8] - - cbz $padbit,.Lstore_base2_64_neon - - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - cbnz $len,.Leven_neon - - stp w10,w11,[$ctx] // store hash value base 2^26 - stp w12,w13,[$ctx,#8] - str w14,[$ctx,#16] - b .Lno_data_neon - -.align 4 -.Lstore_base2_64_neon: - stp $h0,$h1,[$ctx] // store hash value base 2^64 - stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed - b .Lno_data_neon - -.align 4 -.Lbase2_64_neon: - ldp $r0,$r1,[$ctx,#32] // load key value - - ldp $h0,$h1,[$ctx] // load hash value base 2^64 - ldr $h2,[$ctx,#16] - - tst $len,#31 - b.eq .Linit_neon - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __ARMEB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - -.Linit_neon: - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - - ////////////////////////////////// initialize r^n table - mov $h0,$r0 // r^1 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - mov $h1,$r1 - mov $h2,xzr - add $ctx,$ctx,#48+12 - bl poly1305_splat - - bl poly1305_mult // r^2 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^3 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^4 - sub $ctx,$ctx,#4 - bl poly1305_splat - ldr x30,[sp,#8] - - add $in2,$inp,#32 - adr $zeros,.Lzeros - subs $len,$len,#64 - csel $in2,$zeros,$in2,lo - - mov x4,#1 - str x4,[$ctx,#-24] // set is_base2_26 - sub $ctx,$ctx,#48 // restore original $ctx - b .Ldo_neon - -.align 4 -.Leven_neon: - add $in2,$inp,#32 - adr $zeros,.Lzeros - subs $len,$len,#64 - csel $in2,$zeros,$in2,lo - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - -.Ldo_neon: - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - ldp x9,x13,[$in2],#48 - - lsl $padbit,$padbit,#24 - add x15,$ctx,#48 - -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN23_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN23_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - fmov $IN23_2,x8 - fmov $IN23_3,x10 - fmov $IN23_4,x12 - - ldp x8,x12,[$inp],#16 // inp[0:1] - ldp x9,x13,[$inp],#48 - - ld1 {$R0,$R1,$S1,$R2},[x15],#64 - ld1 {$S2,$R3,$S3,$R4},[x15],#64 - ld1 
{$S4},[x15] - -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN01_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN01_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - movi $MASK.2d,#-1 - fmov $IN01_2,x8 - fmov $IN01_3,x10 - fmov $IN01_4,x12 - ushr $MASK.2d,$MASK.2d,#38 - - b.ls .Lskip_loop - -.align 4 -.Loop_neon: - //////////////////////////////////////////////////////////////// - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - // \___________________/ - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - // \___________________/ \____________________/ - // - // Note that we start with inp[2:3]*r^2. This is because it - // doesn't depend on reduction in previous iteration. - //////////////////////////////////////////////////////////////// - // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 - // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 - // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 - // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 - // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - subs $len,$len,#64 - umull $ACC4,$IN23_0,${R4}[2] - csel $in2,$zeros,$in2,lo - umull $ACC3,$IN23_0,${R3}[2] - umull $ACC2,$IN23_0,${R2}[2] - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - umull $ACC1,$IN23_0,${R1}[2] - ldp x9,x13,[$in2],#48 - umull $ACC0,$IN23_0,${R0}[2] -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - umlal $ACC4,$IN23_1,${R3}[2] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC3,$IN23_1,${R2}[2] - and x5,x9,#0x03ffffff - umlal $ACC2,$IN23_1,${R1}[2] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN23_1,${R0}[2] - ubfx x7,x9,#26,#26 - umlal $ACC0,$IN23_1,${S4}[2] - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - - umlal $ACC4,$IN23_2,${R2}[2] - extr x8,x12,x8,#52 - umlal $ACC3,$IN23_2,${R1}[2] - extr x9,x13,x9,#52 - umlal $ACC2,$IN23_2,${R0}[2] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC1,$IN23_2,${S4}[2] - fmov $IN23_0,x4 - umlal $ACC0,$IN23_2,${S3}[2] - and x8,x8,#0x03ffffff - - umlal $ACC4,$IN23_3,${R1}[2] - and x9,x9,#0x03ffffff - umlal $ACC3,$IN23_3,${R0}[2] - ubfx x10,x12,#14,#26 - umlal $ACC2,$IN23_3,${S4}[2] - ubfx x11,x13,#14,#26 - umlal $ACC1,$IN23_3,${S3}[2] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC0,$IN23_3,${S2}[2] - fmov $IN23_1,x6 - - add $IN01_2,$IN01_2,$H2 - add x12,$padbit,x12,lsr#40 - umlal $ACC4,$IN23_4,${R0}[2] - add x13,$padbit,x13,lsr#40 - umlal $ACC3,$IN23_4,${S4}[2] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC2,$IN23_4,${S3}[2] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN23_4,${S2}[2] - fmov $IN23_2,x8 - umlal $ACC0,$IN23_4,${S1}[2] - fmov $IN23_3,x10 - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4 and accumulate - - add $IN01_0,$IN01_0,$H0 - fmov $IN23_4,x12 - umlal $ACC3,$IN01_2,${R1}[0] - ldp x8,x12,[$inp],#16 // inp[0:1] - umlal $ACC0,$IN01_2,${S3}[0] - ldp x9,x13,[$inp],#48 - umlal 
$ACC4,$IN01_2,${R2}[0] - umlal $ACC1,$IN01_2,${S4}[0] - umlal $ACC2,$IN01_2,${R0}[0] -#ifdef __ARMEB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3}[0] - umlal $ACC4,$IN01_0,${R4}[0] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC2,$IN01_0,${R2}[0] - and x5,x9,#0x03ffffff - umlal $ACC0,$IN01_0,${R0}[0] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN01_0,${R1}[0] - ubfx x7,x9,#26,#26 - - add $IN01_3,$IN01_3,$H3 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - umlal $ACC3,$IN01_1,${R2}[0] - extr x8,x12,x8,#52 - umlal $ACC4,$IN01_1,${R3}[0] - extr x9,x13,x9,#52 - umlal $ACC0,$IN01_1,${S4}[0] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC2,$IN01_1,${R1}[0] - fmov $IN01_0,x4 - umlal $ACC1,$IN01_1,${R0}[0] - and x8,x8,#0x03ffffff - - add $IN01_4,$IN01_4,$H4 - and x9,x9,#0x03ffffff - umlal $ACC3,$IN01_3,${R0}[0] - ubfx x10,x12,#14,#26 - umlal $ACC0,$IN01_3,${S2}[0] - ubfx x11,x13,#14,#26 - umlal $ACC4,$IN01_3,${R1}[0] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC1,$IN01_3,${S3}[0] - fmov $IN01_1,x6 - umlal $ACC2,$IN01_3,${S4}[0] - add x12,$padbit,x12,lsr#40 - - umlal $ACC3,$IN01_4,${S4}[0] - add x13,$padbit,x13,lsr#40 - umlal $ACC0,$IN01_4,${S1}[0] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC4,$IN01_4,${R0}[0] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN01_4,${S2}[0] - fmov $IN01_2,x8 - umlal $ACC2,$IN01_4,${S3}[0] - fmov $IN01_3,x10 - fmov $IN01_4,x12 - - ///////////////////////////////////////////////////////////////// - // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - // and P. Schwabe - // - // [see discussion in poly1305-armv4 module] - - ushr $T0.2d,$ACC3,#26 - xtn $H3,$ACC3 - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - bic $H3,#0xfc,lsl#24 // &=0x03ffffff - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - xtn $H4,$ACC4 - ushr $T1.2d,$ACC1,#26 - xtn $H1,$ACC1 - bic $H4,#0xfc,lsl#24 - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - shrn $T1.2s,$ACC2,#26 - xtn $H2,$ACC2 - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - bic $H1,#0xfc,lsl#24 - add $H3,$H3,$T1.2s // h2 -> h3 - bic $H2,#0xfc,lsl#24 - - shrn $T0.2s,$ACC0,#26 - xtn $H0,$ACC0 - ushr $T1.2s,$H3,#26 - bic $H3,#0xfc,lsl#24 - bic $H0,#0xfc,lsl#24 - add $H1,$H1,$T0.2s // h0 -> h1 - add $H4,$H4,$T1.2s // h3 -> h4 - - b.hi .Loop_neon - -.Lskip_loop: - dup $IN23_2,${IN23_2}[0] - add $IN01_2,$IN01_2,$H2 - - //////////////////////////////////////////////////////////////// - // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - adds $len,$len,#32 - b.ne .Long_tail - - dup $IN23_2,${IN01_2}[0] - add $IN23_0,$IN01_0,$H0 - add $IN23_3,$IN01_3,$H3 - add $IN23_1,$IN01_1,$H1 - add $IN23_4,$IN01_4,$H4 - -.Long_tail: - dup $IN23_0,${IN23_0}[0] - umull2 $ACC0,$IN23_2,${S3} - umull2 $ACC3,$IN23_2,${R1} - umull2 $ACC4,$IN23_2,${R2} - umull2 $ACC2,$IN23_2,${R0} - umull2 $ACC1,$IN23_2,${S4} - - dup $IN23_1,${IN23_1}[0] - umlal2 $ACC0,$IN23_0,${R0} - umlal2 $ACC2,$IN23_0,${R2} - umlal2 $ACC3,$IN23_0,${R3} - umlal2 $ACC4,$IN23_0,${R4} - umlal2 $ACC1,$IN23_0,${R1} - - dup $IN23_3,${IN23_3}[0] - umlal2 $ACC0,$IN23_1,${S4} - umlal2 $ACC3,$IN23_1,${R2} - umlal2 $ACC2,$IN23_1,${R1} - umlal2 $ACC4,$IN23_1,${R3} - umlal2 $ACC1,$IN23_1,${R0} - - dup $IN23_4,${IN23_4}[0] - umlal2 $ACC3,$IN23_3,${R0} - umlal2 $ACC4,$IN23_3,${R1} - umlal2 $ACC0,$IN23_3,${S2} - umlal2 $ACC1,$IN23_3,${S3} - umlal2 $ACC2,$IN23_3,${S4} - - umlal2 
$ACC3,$IN23_4,${S4} - umlal2 $ACC0,$IN23_4,${S1} - umlal2 $ACC4,$IN23_4,${R0} - umlal2 $ACC1,$IN23_4,${S2} - umlal2 $ACC2,$IN23_4,${S3} - - b.eq .Lshort_tail - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4:r^3 and accumulate - - add $IN01_0,$IN01_0,$H0 - umlal $ACC3,$IN01_2,${R1} - umlal $ACC0,$IN01_2,${S3} - umlal $ACC4,$IN01_2,${R2} - umlal $ACC1,$IN01_2,${S4} - umlal $ACC2,$IN01_2,${R0} - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3} - umlal $ACC0,$IN01_0,${R0} - umlal $ACC4,$IN01_0,${R4} - umlal $ACC1,$IN01_0,${R1} - umlal $ACC2,$IN01_0,${R2} - - add $IN01_3,$IN01_3,$H3 - umlal $ACC3,$IN01_1,${R2} - umlal $ACC0,$IN01_1,${S4} - umlal $ACC4,$IN01_1,${R3} - umlal $ACC1,$IN01_1,${R0} - umlal $ACC2,$IN01_1,${R1} - - add $IN01_4,$IN01_4,$H4 - umlal $ACC3,$IN01_3,${R0} - umlal $ACC0,$IN01_3,${S2} - umlal $ACC4,$IN01_3,${R1} - umlal $ACC1,$IN01_3,${S3} - umlal $ACC2,$IN01_3,${S4} - - umlal $ACC3,$IN01_4,${S4} - umlal $ACC0,$IN01_4,${S1} - umlal $ACC4,$IN01_4,${R0} - umlal $ACC1,$IN01_4,${S2} - umlal $ACC2,$IN01_4,${S3} - -.Lshort_tail: - //////////////////////////////////////////////////////////////// - // horizontal add - - addp $ACC3,$ACC3,$ACC3 - ldp d8,d9,[sp,#16] // meet ABI requirements - addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] - addp $ACC4,$ACC4,$ACC4 - ldp d12,d13,[sp,#48] - addp $ACC1,$ACC1,$ACC1 - ldp d14,d15,[sp,#64] - addp $ACC2,$ACC2,$ACC2 - - //////////////////////////////////////////////////////////////// - // lazy reduction, but without narrowing - - ushr $T0.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - and $ACC4,$ACC4,$MASK.2d - ushr $T1.2d,$ACC1,#26 - and $ACC1,$ACC1,$MASK.2d - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - ushr $T1.2d,$ACC2,#26 - and $ACC2,$ACC2,$MASK.2d - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - add $ACC3,$ACC3,$T1.2d // h2 -> h3 - - ushr $T0.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - ushr $T1.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - add $ACC1,$ACC1,$T0.2d // h0 -> h1 - add $ACC4,$ACC4,$T1.2d // h3 -> h4 - - //////////////////////////////////////////////////////////////// - // write the result, can be partially reduced - - st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 - st1 {$ACC4}[0],[$ctx] - -.Lno_data_neon: - ldr x29,[sp],#80 - ret -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.type poly1305_emit_neon,%function -.align 5 -poly1305_emit_neon: - ldr $is_base2_26,[$ctx,#24] - cbz $is_base2_26,GFp_poly1305_emit - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $h2,$h2,xzr // can be partially reduced... - - ldp $t0,$t1,[$nonce] // load nonce - - and $d0,$h2,#-4 // ... 
so reduce - add $d0,$d0,$h2,lsr#2 - and $h2,$h2,#3 - adds $h0,$h0,$d0 - adcs $h1,$h1,xzr - adc $h2,$h2,xzr - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __ARMEB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __ARMEB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size poly1305_emit_neon,.-poly1305_emit_neon - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0 -.LGFp_armcap_P: -#ifdef __ILP32__ -.long GFp_armcap_P-. -#else -.quad GFp_armcap_P-. -#endif -.asciz "Poly1305 for ARMv8, CRYPTOGAMS by " -.align 2 -___ - -foreach (split("\n",$code)) { - s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or - s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or - (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or - (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or - (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or - (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or - (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); - - s/\.[124]([sd])\[/.$1\[/; - - print $_,"\n"; -} -close STDOUT or die "error closing STDOUT"; diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl deleted file mode 100755 index 3f0e4c416b..0000000000 --- a/crypto/poly1305/asm/poly1305-x86.pl +++ /dev/null @@ -1,1223 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for x86. -# -# April 2015 -# -# Numbers are cycles per processed byte with poly1305_blocks alone, -# measured with rdtsc at fixed clock frequency. -# -# IALU/gcc-3.4(*) SSE2(**) AVX2 -# Pentium 15.7/+80% - -# PIII 6.21/+90% - -# P4 19.8/+40% 3.24 -# Core 2 4.85/+90% 1.80 -# Westmere 4.58/+100% 1.43 -# Sandy Bridge 3.90/+100% 1.36 -# Haswell 3.88/+70% 1.18 0.72 -# Silvermont 11.0/+40% 4.80 -# VIA Nano 6.71/+90% 2.47 -# Sledgehammer 3.51/+180% 4.27 -# Bulldozer 4.53/+140% 1.31 -# -# (*) gcc 4.8 for some reason generated worse code; -# (**) besides SSE2 there are floating-point and AVX options; FP -# is deemed unnecessary, because pre-SSE2 processor are too -# old to care about, while it's not the fastest option on -# SSE2-capable ones; AVX is omitted, because it doesn't give -# a lot of improvement, 5-10% depending on processor; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -$output=pop; -open STDOUT,">$output"; - -&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386"); - -$sse2=$avx=0; -for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } - -if ($sse2) { - &static_label("const_sse2"); - &static_label("enter_blocks"); - &static_label("enter_emit"); - &external_label("GFp_ia32cap_P"); - - # This may be set to 2, but valgrind can't do AVX2 on 32-bit. Without a - # way to verify test coverage, keep it disabled. - # The AVX2 code was removed. - $avx = 0; -} - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int32 h[5]; # current hash value base 2^32 -# unsigned __int32 pad; # is_base2_26 in vector context -# unsigned __int32 r[4]; # key value base 2^32 - -&align(64); -&function_begin("GFp_poly1305_init_asm"); - &mov ("edi",&wparam(0)); # context - &mov ("esi",&wparam(1)); # key - &mov ("ebp",&wparam(2)); # function table - - &xor ("eax","eax"); - &mov (&DWP(4*0,"edi"),"eax"); # zero hash value - &mov (&DWP(4*1,"edi"),"eax"); - &mov (&DWP(4*2,"edi"),"eax"); - &mov (&DWP(4*3,"edi"),"eax"); - &mov (&DWP(4*4,"edi"),"eax"); - &mov (&DWP(4*5,"edi"),"eax"); # is_base2_26 - - &cmp ("esi",0); - &je (&label("nokey")); - - if ($sse2) { - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("ebx"); - - &lea ("eax",&DWP("GFp_poly1305_blocks-".&label("pic_point"),"ebx")); - &lea ("edx",&DWP("GFp_poly1305_emit-".&label("pic_point"),"ebx")); - - &picmeup("edi","GFp_ia32cap_P","ebx",&label("pic_point")); - &mov ("ecx",&DWP(0,"edi")); - &and ("ecx",1<<26|1<<24); - &cmp ("ecx",1<<26|1<<24); # SSE2 and XMM? - # The non-SSE2 code was removed. - - &lea ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx")); - &lea ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx")); - - # AVX2 code removed. - - # The non-SSE2 code was removed. - - &mov ("edi",&wparam(0)); # reload context - &mov (&DWP(0,"ebp"),"eax"); # fill function table - &mov (&DWP(4,"ebp"),"edx"); - } - - &mov ("eax",&DWP(4*0,"esi")); # load input key - &mov ("ebx",&DWP(4*1,"esi")); - &mov ("ecx",&DWP(4*2,"esi")); - &mov ("edx",&DWP(4*3,"esi")); - &and ("eax",0x0fffffff); - &and ("ebx",0x0ffffffc); - &and ("ecx",0x0ffffffc); - &and ("edx",0x0ffffffc); - &mov (&DWP(4*6,"edi"),"eax"); - &mov (&DWP(4*7,"edi"),"ebx"); - &mov (&DWP(4*8,"edi"),"ecx"); - &mov (&DWP(4*9,"edi"),"edx"); - - &mov ("eax",$sse2); -&set_label("nokey"); -&function_end("GFp_poly1305_init_asm"); - -($h0,$h1,$h2,$h3,$h4, - $d0,$d1,$d2,$d3, - $r0,$r1,$r2,$r3, - $s1,$s2,$s3)=map(4*$_,(0..15)); - -&function_begin("GFp_poly1305_blocks"); - &mov ("edi",&wparam(0)); # ctx - &mov ("esi",&wparam(1)); # inp - &mov ("ecx",&wparam(2)); # len -&set_label("enter_blocks"); - &and ("ecx",-15); - &jz (&label("nodata")); - - &stack_push(16); - &mov ("eax",&DWP(4*6,"edi")); # r0 - &mov ("ebx",&DWP(4*7,"edi")); # r1 - &lea ("ebp",&DWP(0,"esi","ecx")); # end of input - &mov ("ecx",&DWP(4*8,"edi")); # r2 - &mov ("edx",&DWP(4*9,"edi")); # r3 - - &mov (&wparam(2),"ebp"); - &mov ("ebp","esi"); - - &mov (&DWP($r0,"esp"),"eax"); # r0 - &mov ("eax","ebx"); - &shr ("eax",2); - &mov (&DWP($r1,"esp"),"ebx"); # r1 - &add ("eax","ebx"); # s1 - &mov ("ebx","ecx"); - &shr ("ebx",2); - &mov (&DWP($r2,"esp"),"ecx"); # r2 - &add ("ebx","ecx"); # s2 - &mov ("ecx","edx"); - &shr ("ecx",2); - &mov (&DWP($r3,"esp"),"edx"); # r3 - &add ("ecx","edx"); # s3 - &mov (&DWP($s1,"esp"),"eax"); # s1 - &mov (&DWP($s2,"esp"),"ebx"); # s2 - &mov (&DWP($s3,"esp"),"ecx"); # s3 - - &mov ("eax",&DWP(4*0,"edi")); # load hash value - &mov ("ebx",&DWP(4*1,"edi")); - &mov ("ecx",&DWP(4*2,"edi")); - &mov ("esi",&DWP(4*3,"edi")); - &mov ("edi",&DWP(4*4,"edi")); - &jmp (&label("loop")); - -&set_label("loop",32); - &add ("eax",&DWP(4*0,"ebp")); # accumulate input - &adc ("ebx",&DWP(4*1,"ebp")); - &adc ("ecx",&DWP(4*2,"ebp")); - &adc ("esi",&DWP(4*3,"ebp")); - &lea ("ebp",&DWP(4*4,"ebp")); - &adc ("edi",&wparam(3)); # padbit - - &mov (&DWP($h0,"esp"),"eax"); # put aside hash[+inp] - &mov (&DWP($h3,"esp"),"esi"); - - &mul (&DWP($r0,"esp")); # h0*r0 - &mov (&DWP($h4,"esp"),"edi"); - &mov ("edi","eax"); - 
&mov ("eax","ebx"); # h1 - &mov ("esi","edx"); - &mul (&DWP($s3,"esp")); # h1*s3 - &add ("edi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("esi","edx"); - &mul (&DWP($s2,"esp")); # h2*s2 - &add ("edi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("esi","edx"); - &mul (&DWP($s1,"esp")); # h3*s1 - &add ("edi","eax"); - &mov ("eax",&DWP($h0,"esp")); - &adc ("esi","edx"); - - &mul (&DWP($r1,"esp")); # h0*r1 - &mov (&DWP($d0,"esp"),"edi"); - &xor ("edi","edi"); - &add ("esi","eax"); - &mov ("eax","ebx"); # h1 - &adc ("edi","edx"); - &mul (&DWP($r0,"esp")); # h1*r0 - &add ("esi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("edi","edx"); - &mul (&DWP($s3,"esp")); # h2*s3 - &add ("esi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("edi","edx"); - &mul (&DWP($s2,"esp")); # h3*s2 - &add ("esi","eax"); - &mov ("eax",&DWP($h4,"esp")); - &adc ("edi","edx"); - &imul ("eax",&DWP($s1,"esp")); # h4*s1 - &add ("esi","eax"); - &mov ("eax",&DWP($h0,"esp")); - &adc ("edi",0); - - &mul (&DWP($r2,"esp")); # h0*r2 - &mov (&DWP($d1,"esp"),"esi"); - &xor ("esi","esi"); - &add ("edi","eax"); - &mov ("eax","ebx"); # h1 - &adc ("esi","edx"); - &mul (&DWP($r1,"esp")); # h1*r1 - &add ("edi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("esi","edx"); - &mul (&DWP($r0,"esp")); # h2*r0 - &add ("edi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("esi","edx"); - &mul (&DWP($s3,"esp")); # h3*s3 - &add ("edi","eax"); - &mov ("eax",&DWP($h4,"esp")); - &adc ("esi","edx"); - &imul ("eax",&DWP($s2,"esp")); # h4*s2 - &add ("edi","eax"); - &mov ("eax",&DWP($h0,"esp")); - &adc ("esi",0); - - &mul (&DWP($r3,"esp")); # h0*r3 - &mov (&DWP($d2,"esp"),"edi"); - &xor ("edi","edi"); - &add ("esi","eax"); - &mov ("eax","ebx"); # h1 - &adc ("edi","edx"); - &mul (&DWP($r2,"esp")); # h1*r2 - &add ("esi","eax"); - &mov ("eax","ecx"); # h2 - &adc ("edi","edx"); - &mul (&DWP($r1,"esp")); # h2*r1 - &add ("esi","eax"); - &mov ("eax",&DWP($h3,"esp")); - &adc ("edi","edx"); - &mul (&DWP($r0,"esp")); # h3*r0 - &add ("esi","eax"); - &mov ("ecx",&DWP($h4,"esp")); - &adc ("edi","edx"); - - &mov ("edx","ecx"); - &imul ("ecx",&DWP($s3,"esp")); # h4*s3 - &add ("esi","ecx"); - &mov ("eax",&DWP($d0,"esp")); - &adc ("edi",0); - - &imul ("edx",&DWP($r0,"esp")); # h4*r0 - &add ("edx","edi"); - - &mov ("ebx",&DWP($d1,"esp")); - &mov ("ecx",&DWP($d2,"esp")); - - &mov ("edi","edx"); # last reduction step - &shr ("edx",2); - &and ("edi",3); - &lea ("edx",&DWP(0,"edx","edx",4)); # *5 - &add ("eax","edx"); - &adc ("ebx",0); - &adc ("ecx",0); - &adc ("esi",0); - &adc ("edi",0); - - &cmp ("ebp",&wparam(2)); # done yet? - &jne (&label("loop")); - - &mov ("edx",&wparam(0)); # ctx - &stack_pop(16); - &mov (&DWP(4*0,"edx"),"eax"); # store hash value - &mov (&DWP(4*1,"edx"),"ebx"); - &mov (&DWP(4*2,"edx"),"ecx"); - &mov (&DWP(4*3,"edx"),"esi"); - &mov (&DWP(4*4,"edx"),"edi"); -&set_label("nodata"); -&function_end("GFp_poly1305_blocks"); - -&function_begin("GFp_poly1305_emit"); - &mov ("ebp",&wparam(0)); # context -&set_label("enter_emit"); - &mov ("edi",&wparam(1)); # output - &mov ("eax",&DWP(4*0,"ebp")); # load hash value - &mov ("ebx",&DWP(4*1,"ebp")); - &mov ("ecx",&DWP(4*2,"ebp")); - &mov ("edx",&DWP(4*3,"ebp")); - &mov ("esi",&DWP(4*4,"ebp")); - - &add ("eax",5); # compare to modulus - &adc ("ebx",0); - &adc ("ecx",0); - &adc ("edx",0); - &adc ("esi",0); - &shr ("esi",2); # did it carry/borrow? - &neg ("esi"); # do we choose hash-modulus? 
-
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-
-	&not	("esi");			# or original hash value?
-	&mov	("eax",&DWP(4*0,"ebp"));
-	&mov	("ebx",&DWP(4*1,"ebp"));
-	&mov	("ecx",&DWP(4*2,"ebp"));
-	&mov	("edx",&DWP(4*3,"ebp"));
-	&mov	("ebp",&wparam(2));
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&or	("eax",&DWP(4*0,"edi"));
-	&or	("ebx",&DWP(4*1,"edi"));
-	&or	("ecx",&DWP(4*2,"edi"));
-	&or	("edx",&DWP(4*3,"edi"));
-
-	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
-	&adc	("ebx",&DWP(4*1,"ebp"));
-	&adc	("ecx",&DWP(4*2,"ebp"));
-	&adc	("edx",&DWP(4*3,"ebp"));
-
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-&function_end("GFp_poly1305_emit");
-
-if ($sse2) {
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int32 h[5];		# current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int32 r[4];		# key value base 2^32
-# unsigned __int32 pad[2];
-# struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
-#
-# where r^n are base 2^26 digits of degrees of multiplier key. There are
-# 5 digits, but last four are interleaved with multiples of 5, totalling
-# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
-my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
-my $MASK=$T2;				# borrow and keep in mind
-
-&align	(32);
-&function_begin_B("_poly1305_init_sse2");
-	&movdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
-	&lea	("edi",&DWP(16*3,"edi"));	# size optimization
-	&mov	("ebp","esp");
-	&sub	("esp",16*(9+5));
-	&and	("esp",-16);
-
-	#&pand	($D4,&QWP(96,"ebx"));		# magic mask
-	&movq	($MASK,&QWP(64,"ebx"));
-
-	&movdqa	($D0,$D4);
-	&movdqa	($D1,$D4);
-	&movdqa	($D2,$D4);
-
-	&pand	($D0,$MASK);			# -> base 2^26
-	&psrlq	($D1,26);
-	&psrldq	($D2,6);
-	&pand	($D1,$MASK);
-	&movdqa	($D3,$D2);
-	&psrlq	($D2,4);
-	&psrlq	($D3,30);
-	&pand	($D2,$MASK);
-	&pand	($D3,$MASK);
-	&psrldq	($D4,13);
-
-	&lea	("edx",&DWP(16*9,"esp"));	# size optimization
-	&mov	("ecx",2);
-&set_label("square");
-	&movdqa	(&QWP(16*0,"esp"),$D0);
-	&movdqa	(&QWP(16*1,"esp"),$D1);
-	&movdqa	(&QWP(16*2,"esp"),$D2);
-	&movdqa	(&QWP(16*3,"esp"),$D3);
-	&movdqa	(&QWP(16*4,"esp"),$D4);
-
-	&movdqa	($T1,$D1);
-	&movdqa	($T0,$D2);
-	&pslld	($T1,2);
-	&pslld	($T0,2);
-	&paddd	($T1,$D1);			# *5
-	&paddd	($T0,$D2);			# *5
-	&movdqa	(&QWP(16*5,"esp"),$T1);
-	&movdqa	(&QWP(16*6,"esp"),$T0);
-	&movdqa	($T1,$D3);
-	&movdqa	($T0,$D4);
-	&pslld	($T1,2);
-	&pslld	($T0,2);
-	&paddd	($T1,$D3);			# *5
-	&paddd	($T0,$D4);			# *5
-	&movdqa	(&QWP(16*7,"esp"),$T1);
-	&movdqa	(&QWP(16*8,"esp"),$T0);
-
-	&pshufd	($T1,$D0,0b01000100);
-	&movdqa	($T0,$D1);
-	&pshufd	($D1,$D1,0b01000100);
-	&pshufd	($D2,$D2,0b01000100);
-	&pshufd	($D3,$D3,0b01000100);
-	&pshufd	($D4,$D4,0b01000100);
-	&movdqa	(&QWP(16*0,"edx"),$T1);
-	&movdqa	(&QWP(16*1,"edx"),$D1);
-	&movdqa	(&QWP(16*2,"edx"),$D2);
-	&movdqa	(&QWP(16*3,"edx"),$D3);
-	&movdqa	(&QWP(16*4,"edx"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&pmuludq
($D4,$D0); # h4*r0 - &pmuludq ($D3,$D0); # h3*r0 - &pmuludq ($D2,$D0); # h2*r0 - &pmuludq ($D1,$D0); # h1*r0 - &pmuludq ($D0,$T1); # h0*r0 - -sub pmuladd { -my $load = shift; -my $base = shift; $base = "esp" if (!defined($base)); - - ################################################################ - # As for choice to "rotate" $T0-$T2 in order to move paddq - # past next multiplication. While it makes code harder to read - # and doesn't have significant effect on most processors, it - # makes a lot of difference on Atom, up to 30% improvement. - - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3 - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2 - &paddq ($D4,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1 - &paddq ($D3,$T1); - &$load ($T1,5); # s1 - &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0 - &paddq ($D2,$T2); - &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4 - &$load ($T2,2); # r2^n - &paddq ($D1,$T0); - - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2 - &paddq ($D0,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1 - &paddq ($D4,$T2); - &$load ($T2,6); # s2^n - &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0 - &paddq ($D3,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4 - &paddq ($D2,$T1); - &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3 - &$load ($T1,3); # r3^n - &paddq ($D1,$T2); - - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1 - &paddq ($D0,$T0); - &$load ($T0,7); # s3^n - &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0 - &paddq ($D4,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4 - &paddq ($D3,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3 - &paddq ($D2,$T0); - &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2 - &$load ($T0,4); # r4^n - &paddq ($D1,$T1); - - &$load ($T1,8); # s4^n - &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0 - &paddq ($D0,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4 - &paddq ($D4,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1 - &paddq ($D3,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2 - &paddq ($D0,$T2); - &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3 - &movdqa ($MASK,&QWP(64,"ebx")); - &paddq ($D1,$T0); - &paddq ($D2,$T1); -} - &pmuladd (sub { my ($reg,$i)=@_; - &movdqa ($reg,&QWP(16*$i,"esp")); - },"edx"); - -sub lazy_reduction { -my $extra = shift; - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - # - # [(*) see discussion in poly1305-armv4 module] - - &movdqa ($T0,$D3); - &pand ($D3,$MASK); - &psrlq ($T0,26); - &$extra () if (defined($extra)); - &paddq ($T0,$D4); # h3 -> h4 - &movdqa ($T1,$D0); - &pand ($D0,$MASK); - &psrlq ($T1,26); - &movdqa ($D4,$T0); - &paddq ($T1,$D1); # h0 -> h1 - &psrlq ($T0,26); - &pand ($D4,$MASK); - &movdqa ($D1,$T1); - &psrlq ($T1,26); - &paddd ($D0,$T0); # favour paddd when - # possible, because - # paddq is "broken" - # on Atom - &psllq ($T0,2); - &paddq ($T1,$D2); # h1 -> h2 - &paddq ($T0,$D0); # h4 -> h0 (*) - &pand ($D1,$MASK); - &movdqa ($D2,$T1); - &psrlq ($T1,26); - &pand ($D2,$MASK); - &paddd ($T1,$D3); # h2 -> h3 - &movdqa ($D0,$T0); - &psrlq ($T0,26); - &movdqa ($D3,$T1); - &psrlq ($T1,26); - &pand ($D0,$MASK); - &paddd ($D1,$T0); # h0 -> h1 - &pand ($D3,$MASK); - &paddd ($D4,$T1); # h3 -> h4 -} - &lazy_reduction (); - - &dec ("ecx"); - &jz (&label("square_break")); - - &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 - &punpcklqdq ($D1,&QWP(16*1,"esp")); - &punpcklqdq ($D2,&QWP(16*2,"esp")); - &punpcklqdq ($D3,&QWP(16*3,"esp")); - &punpcklqdq ($D4,&QWP(16*4,"esp")); - &jmp (&label("square")); - -&set_label("square_break"); - &psllq ($D0,32); # -> r^3:0:r^4:0 - &psllq ($D1,32); - &psllq ($D2,32); - &psllq ($D3,32); - &psllq ($D4,32); - &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 - &por ($D1,&QWP(16*1,"esp")); - &por ($D2,&QWP(16*2,"esp")); - &por ($D3,&QWP(16*3,"esp")); - &por ($D4,&QWP(16*4,"esp")); - - &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 - &pshufd ($D1,$D1,0b10001101); - &pshufd ($D2,$D2,0b10001101); - &pshufd ($D3,$D3,0b10001101); - &pshufd ($D4,$D4,0b10001101); - - &movdqu (&QWP(16*0,"edi"),$D0); # save the table - &movdqu (&QWP(16*1,"edi"),$D1); - &movdqu (&QWP(16*2,"edi"),$D2); - &movdqu (&QWP(16*3,"edi"),$D3); - &movdqu (&QWP(16*4,"edi"),$D4); - - &movdqa ($T1,$D1); - &movdqa ($T0,$D2); - &pslld ($T1,2); - &pslld ($T0,2); - &paddd ($T1,$D1); # *5 - &paddd ($T0,$D2); # *5 - &movdqu (&QWP(16*5,"edi"),$T1); - &movdqu (&QWP(16*6,"edi"),$T0); - &movdqa ($T1,$D3); - &movdqa ($T0,$D4); - &pslld ($T1,2); - &pslld ($T0,2); - &paddd ($T1,$D3); # *5 - &paddd ($T0,$D4); # *5 - &movdqu (&QWP(16*7,"edi"),$T1); - &movdqu (&QWP(16*8,"edi"),$T0); - - &mov ("esp","ebp"); - &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization - &ret (); -&function_end_B("_poly1305_init_sse2"); - -&align (32); -&function_begin("_poly1305_blocks_sse2"); - &mov ("edi",&wparam(0)); # ctx - &mov ("esi",&wparam(1)); # inp - &mov ("ecx",&wparam(2)); # len - - &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 - &and ("ecx",-16); - &jz (&label("nodata")); - &cmp ("ecx",64); - &jae (&label("enter_sse2")); - &test ("eax","eax"); # is_base2_26? - &jz (&label("enter_blocks")); - -&set_label("enter_sse2",16); - &call (&label("pic_point")); -&set_label("pic_point"); - &blindpop("ebx"); - &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); - - &test ("eax","eax"); # is_base2_26? 
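Backing up to the `lazy_reduction` helper defined above: it never reduces h fully, it only pushes each limb's overflow into the next limb (with the h4 carry multiplied by 5 on its way back into h0, computed as c + 4c via `psllq 2`), leaving every limb at worst a bit or two above 26 bits. A scalar Rust sketch of the same carry chain, with u64s standing in for the SSE2 lanes (illustrative only, not code from this file):

    // Mirrors lazy_reduction's order: h3->h4, h0->h1, h4->h0 (*5),
    // h1->h2, h2->h3, h0->h1, h3->h4.
    fn lazy_reduce(h: &mut [u64; 5]) {
        const MASK: u64 = (1 << 26) - 1;
        let mut c;
        c = h[3] >> 26; h[3] &= MASK; h[4] += c;
        c = h[0] >> 26; h[0] &= MASK; h[1] += c;
        c = h[4] >> 26; h[4] &= MASK; h[0] += c * 5; // 2^130 = 5 (mod p)
        c = h[1] >> 26; h[1] &= MASK; h[2] += c;
        c = h[2] >> 26; h[2] &= MASK; h[3] += c;
        c = h[0] >> 26; h[0] &= MASK; h[1] += c;
        c = h[3] >> 26; h[3] &= MASK; h[4] += c;
    }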
-	&jnz	(&label("base2_26"));
-
-	&call	("_poly1305_init_sse2");
-
-	################################################# base 2^32 -> base 2^26
-	&mov	("eax",&DWP(0,"edi"));
-	&mov	("ecx",&DWP(3,"edi"));
-	&mov	("edx",&DWP(6,"edi"));
-	&mov	("esi",&DWP(9,"edi"));
-	&mov	("ebp",&DWP(13,"edi"));
-	&mov	(&DWP(4*5,"edi"),1);		# is_base2_26
-
-	&shr	("ecx",2);
-	&and	("eax",0x3ffffff);
-	&shr	("edx",4);
-	&and	("ecx",0x3ffffff);
-	&shr	("esi",6);
-	&and	("edx",0x3ffffff);
-
-	&movd	($D0,"eax");
-	&movd	($D1,"ecx");
-	&movd	($D2,"edx");
-	&movd	($D3,"esi");
-	&movd	($D4,"ebp");
-
-	&mov	("esi",&wparam(1));		# [reload] inp
-	&mov	("ecx",&wparam(2));		# [reload] len
-	&jmp	(&label("base2_32"));
-
-&set_label("base2_26",16);
-	&movd	($D0,&DWP(4*0,"edi"));		# load hash value
-	&movd	($D1,&DWP(4*1,"edi"));
-	&movd	($D2,&DWP(4*2,"edi"));
-	&movd	($D3,&DWP(4*3,"edi"));
-	&movd	($D4,&DWP(4*4,"edi"));
-	&movdqa	($MASK,&QWP(64,"ebx"));
-
-&set_label("base2_32");
-	&mov	("eax",&wparam(3));		# padbit
-	&mov	("ebp","esp");
-
-	&sub	("esp",16*(5+5+5+9+9));
-	&and	("esp",-16);
-
-	&lea	("edi",&DWP(16*3,"edi"));	# size optimization
-	&shl	("eax",24);			# padbit
-
-	&test	("ecx",31);
-	&jz	(&label("even"));
-
-	################################################################
-	# process single block, with SSE2, because it's still faster
-	# even though half of result is discarded
-
-	&movdqu	($T1,&QWP(0,"esi"));		# input
-	&lea	("esi",&DWP(16,"esi"));
-
-	&movdqa	($T0,$T1);			# -> base 2^26 ...
-	&pand	($T1,$MASK);
-	&paddd	($D0,$T1);			# ... and accumulate
-
-	&movdqa	($T1,$T0);
-	&psrlq	($T0,26);
-	&psrldq	($T1,6);
-	&pand	($T0,$MASK);
-	&paddd	($D1,$T0);
-
-	&movdqa	($T0,$T1);
-	&psrlq	($T1,4);
-	&pand	($T1,$MASK);
-	&paddd	($D2,$T1);
-
-	&movdqa	($T1,$T0);
-	&psrlq	($T0,30);
-	&pand	($T0,$MASK);
-	&psrldq	($T1,7);
-	&paddd	($D3,$T0);
-
-	&movd	($T0,"eax");			# padbit
-	&paddd	($D4,$T1);
-	&movd	($T1,&DWP(16*0+12,"edi"));	# r0
-	&paddd	($D4,$T0);
-
-	&movdqa	(&QWP(16*0,"esp"),$D0);
-	&movdqa	(&QWP(16*1,"esp"),$D1);
-	&movdqa	(&QWP(16*2,"esp"),$D2);
-	&movdqa	(&QWP(16*3,"esp"),$D3);
-	&movdqa	(&QWP(16*4,"esp"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
-	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&pmuludq	($D0,$T1);		# h4*r0
-	&pmuludq	($D1,$T1);		# h3*r0
-	&pmuludq	($D2,$T1);		# h2*r0
-	&movd	($T0,&DWP(16*1+12,"edi"));	# r1
-	&pmuludq	($D3,$T1);		# h1*r0
-	&pmuludq	($D4,$T1);		# h0*r0
-
-	&pmuladd	(sub {	my ($reg,$i)=@_;
-				&movd	($reg,&DWP(16*$i+12,"edi"));
-			});
-
-	&lazy_reduction	();
-
-	&sub	("ecx",16);
-	&jz	(&label("done"));
-
-&set_label("even");
-	&lea	("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
-	&lea	("eax",&DWP(-16*2,"esi"));
-	&sub	("ecx",64);
-
-	################################################################
-	# expand and copy pre-calculated table to stack
-
-	&movdqu	($T0,&QWP(16*0,"edi"));		# r^1:r^2:r^3:r^4
-	&pshufd	($T1,$T0,0b01000100);		# duplicate r^3:r^4
-	&cmovb	("esi","eax");
-	&pshufd	($T0,$T0,0b11101110);		# duplicate r^1:r^2
-	&movdqa	(&QWP(16*0,"edx"),$T1);
-	&lea	("eax",&DWP(16*10,"esp"));
-	&movdqu	($T1,&QWP(16*1,"edi"));
-	&movdqa	(&QWP(16*(0-9),"edx"),$T0);
-	&pshufd	($T0,$T1,0b01000100);
-	&pshufd	($T1,$T1,0b11101110);
-	&movdqa	(&QWP(16*1,"edx"),$T0);
-	&movdqu	($T0,&QWP(16*2,"edi"));
-	&movdqa	(&QWP(16*(1-9),"edx"),$T1);
-	&pshufd
($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*2,"edx"),$T1); - &movdqu ($T1,&QWP(16*3,"edi")); - &movdqa (&QWP(16*(2-9),"edx"),$T0); - &pshufd ($T0,$T1,0b01000100); - &pshufd ($T1,$T1,0b11101110); - &movdqa (&QWP(16*3,"edx"),$T0); - &movdqu ($T0,&QWP(16*4,"edi")); - &movdqa (&QWP(16*(3-9),"edx"),$T1); - &pshufd ($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*4,"edx"),$T1); - &movdqu ($T1,&QWP(16*5,"edi")); - &movdqa (&QWP(16*(4-9),"edx"),$T0); - &pshufd ($T0,$T1,0b01000100); - &pshufd ($T1,$T1,0b11101110); - &movdqa (&QWP(16*5,"edx"),$T0); - &movdqu ($T0,&QWP(16*6,"edi")); - &movdqa (&QWP(16*(5-9),"edx"),$T1); - &pshufd ($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*6,"edx"),$T1); - &movdqu ($T1,&QWP(16*7,"edi")); - &movdqa (&QWP(16*(6-9),"edx"),$T0); - &pshufd ($T0,$T1,0b01000100); - &pshufd ($T1,$T1,0b11101110); - &movdqa (&QWP(16*7,"edx"),$T0); - &movdqu ($T0,&QWP(16*8,"edi")); - &movdqa (&QWP(16*(7-9),"edx"),$T1); - &pshufd ($T1,$T0,0b01000100); - &pshufd ($T0,$T0,0b11101110); - &movdqa (&QWP(16*8,"edx"),$T1); - &movdqa (&QWP(16*(8-9),"edx"),$T0); - -sub load_input { -my ($inpbase,$offbase)=@_; - - &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input - &movdqu ($T1,&QWP($inpbase+16,"esi")); - &lea ("esi",&DWP(16*2,"esi")); - - &movdqa (&QWP($offbase+16*2,"esp"),$D2); - &movdqa (&QWP($offbase+16*3,"esp"),$D3); - &movdqa (&QWP($offbase+16*4,"esp"),$D4); - - &movdqa ($D2,$T0); # splat input - &movdqa ($D3,$T1); - &psrldq ($D2,6); - &psrldq ($D3,6); - &movdqa ($D4,$T0); - &punpcklqdq ($D2,$D3); # 2:3 - &punpckhqdq ($D4,$T1); # 4 - &punpcklqdq ($T0,$T1); # 0:1 - - &movdqa ($D3,$D2); - &psrlq ($D2,4); - &psrlq ($D3,30); - &movdqa ($T1,$T0); - &psrlq ($D4,40); # 4 - &psrlq ($T1,26); - &pand ($T0,$MASK); # 0 - &pand ($T1,$MASK); # 1 - &pand ($D2,$MASK); # 2 - &pand ($D3,$MASK); # 3 - &por ($D4,&QWP(0,"ebx")); # padbit, yes, always - - &movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase); - &movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase); -} - &load_input (16*2,16*5); - - &jbe (&label("skip_loop")); - &jmp (&label("loop")); - -&set_label("loop",32); - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - ################################################################ - - &movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2 - &movdqa (&QWP(16*1,"eax"),$T1); - &movdqa (&QWP(16*2,"eax"),$D2); - &movdqa (&QWP(16*3,"eax"),$D3); - &movdqa (&QWP(16*4,"eax"),$D4); - - ################################################################ - # d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 - # d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 - # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - # d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 - # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - &movdqa ($D1,$T0); - &pmuludq ($T0,$T2); # h0*r0 - &movdqa ($D0,$T1); - &pmuludq ($T1,$T2); # h1*r0 - &pmuludq ($D2,$T2); # h2*r0 - &pmuludq ($D3,$T2); # h3*r0 - &pmuludq ($D4,$T2); # h4*r0 - -sub pmuladd_alt { -my $addr = shift; - - &pmuludq ($D0,&$addr(8)); # h1*s4 - &movdqa ($T2,$D1); - &pmuludq ($D1,&$addr(1)); # h0*r1 - &paddq ($D0,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(2)); # h0*r2 - &paddq ($D1,$T1); - &movdqa ($T1,$T0); - 
&pmuludq ($T0,&$addr(3)); # h0*r3 - &paddq ($D2,$T2); - &movdqa ($T2,&QWP(16*1,"eax")); # pull h1 - &pmuludq ($T1,&$addr(4)); # h0*r4 - &paddq ($D3,$T0); - - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(1)); # h1*r1 - &paddq ($D4,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(2)); # h1*r2 - &paddq ($D2,$T2); - &movdqa ($T2,&QWP(16*2,"eax")); # pull h2 - &pmuludq ($T1,&$addr(3)); # h1*r3 - &paddq ($D3,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(7)); # h2*s3 - &paddq ($D4,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(8)); # h2*s4 - &paddq ($D0,$T2); - - &movdqa ($T2,$T1); - &pmuludq ($T1,&$addr(1)); # h2*r1 - &paddq ($D1,$T0); - &movdqa ($T0,&QWP(16*3,"eax")); # pull h3 - &pmuludq ($T2,&$addr(2)); # h2*r2 - &paddq ($D3,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(6)); # h3*s2 - &paddq ($D4,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&$addr(7)); # h3*s3 - &paddq ($D0,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(8)); # h3*s4 - &paddq ($D1,$T1); - - &movdqa ($T1,&QWP(16*4,"eax")); # pull h4 - &pmuludq ($T0,&$addr(1)); # h3*r1 - &paddq ($D2,$T2); - &movdqa ($T2,$T1); - &pmuludq ($T1,&$addr(8)); # h4*s4 - &paddq ($D4,$T0); - &movdqa ($T0,$T2); - &pmuludq ($T2,&$addr(5)); # h4*s1 - &paddq ($D3,$T1); - &movdqa ($T1,$T0); - &pmuludq ($T0,&$addr(6)); # h4*s2 - &paddq ($D0,$T2); - &movdqa ($MASK,&QWP(64,"ebx")); - &pmuludq ($T1,&$addr(7)); # h4*s3 - &paddq ($D1,$T0); - &paddq ($D2,$T1); -} - &pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); }); - - &load_input (-16*2,0); - &lea ("eax",&DWP(-16*2,"esi")); - &sub ("ecx",64); - - &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value - &paddd ($T1,&QWP(16*(5+1),"esp")); - &paddd ($D2,&QWP(16*(5+2),"esp")); - &paddd ($D3,&QWP(16*(5+3),"esp")); - &paddd ($D4,&QWP(16*(5+4),"esp")); - - &cmovb ("esi","eax"); - &lea ("eax",&DWP(16*10,"esp")); - - &movdqa ($T2,&QWP(16*0,"edx")); # r0^4 - &movdqa (&QWP(16*1,"esp"),$D1); - &movdqa (&QWP(16*1,"eax"),$T1); - &movdqa (&QWP(16*2,"eax"),$D2); - &movdqa (&QWP(16*3,"eax"),$D3); - &movdqa (&QWP(16*4,"eax"),$D4); - - ################################################################ - # d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 - # d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 - # d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - # d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 - # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - &movdqa ($D1,$T0); - &pmuludq ($T0,$T2); # h0*r0 - &paddq ($T0,$D0); - &movdqa ($D0,$T1); - &pmuludq ($T1,$T2); # h1*r0 - &pmuludq ($D2,$T2); # h2*r0 - &pmuludq ($D3,$T2); # h3*r0 - &pmuludq ($D4,$T2); # h4*r0 - - &paddq ($T1,&QWP(16*1,"esp")); - &paddq ($D2,&QWP(16*2,"esp")); - &paddq ($D3,&QWP(16*3,"esp")); - &paddq ($D4,&QWP(16*4,"esp")); - - &pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); }); - - &lazy_reduction (); - - &load_input (16*2,16*5); - - &ja (&label("loop")); - -&set_label("skip_loop"); - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - &pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n - &add ("ecx",32); - &jnz (&label("long_tail")); - - &paddd ($T0,$D0); # add hash value - &paddd ($T1,$D1); - &paddd ($D2,&QWP(16*7,"esp")); - &paddd ($D3,&QWP(16*8,"esp")); - &paddd ($D4,&QWP(16*9,"esp")); - -&set_label("long_tail"); - - &movdqa (&QWP(16*0,"eax"),$T0); - &movdqa (&QWP(16*1,"eax"),$T1); - &movdqa (&QWP(16*2,"eax"),$D2); - &movdqa (&QWP(16*3,"eax"),$D3); - &movdqa (&QWP(16*4,"eax"),$D4); - - ################################################################ - # d4 = h4*r0 + h3*r1 + 
h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - &pmuludq ($T0,$T2); # h0*r0 - &pmuludq ($T1,$T2); # h1*r0 - &pmuludq ($D2,$T2); # h2*r0 - &movdqa ($D0,$T0); - &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n - &pmuludq ($D3,$T2); # h3*r0 - &movdqa ($D1,$T1); - &pmuludq ($D4,$T2); # h4*r0 - - &pmuladd (sub { my ($reg,$i)=@_; - &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10); - },"eax"); - - &jz (&label("short_tail")); - - &load_input (-16*2,0); - - &pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n - &paddd ($T0,&QWP(16*5,"esp")); # add hash value - &paddd ($T1,&QWP(16*6,"esp")); - &paddd ($D2,&QWP(16*7,"esp")); - &paddd ($D3,&QWP(16*8,"esp")); - &paddd ($D4,&QWP(16*9,"esp")); - - ################################################################ - # multiply inp[0:1] by r^4:r^3 and accumulate - - &movdqa (&QWP(16*0,"esp"),$T0); - &pmuludq ($T0,$T2); # h0*r0 - &movdqa (&QWP(16*1,"esp"),$T1); - &pmuludq ($T1,$T2); # h1*r0 - &paddq ($D0,$T0); - &movdqa ($T0,$D2); - &pmuludq ($D2,$T2); # h2*r0 - &paddq ($D1,$T1); - &movdqa ($T1,$D3); - &pmuludq ($D3,$T2); # h3*r0 - &paddq ($D2,&QWP(16*2,"esp")); - &movdqa (&QWP(16*2,"esp"),$T0); - &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n - &paddq ($D3,&QWP(16*3,"esp")); - &movdqa (&QWP(16*3,"esp"),$T1); - &movdqa ($T1,$D4); - &pmuludq ($D4,$T2); # h4*r0 - &paddq ($D4,&QWP(16*4,"esp")); - &movdqa (&QWP(16*4,"esp"),$T1); - - &pmuladd (sub { my ($reg,$i)=@_; - &pshufd ($reg,&QWP(16*$i,"edx"),0x10); - }); - -&set_label("short_tail"); - - ################################################################ - # horizontal addition - - &pshufd ($T1,$D4,0b01001110); - &pshufd ($T0,$D3,0b01001110); - &paddq ($D4,$T1); - &paddq ($D3,$T0); - &pshufd ($T1,$D0,0b01001110); - &pshufd ($T0,$D1,0b01001110); - &paddq ($D0,$T1); - &paddq ($D1,$T0); - &pshufd ($T1,$D2,0b01001110); - #&paddq ($D2,$T1); - - &lazy_reduction (sub { &paddq ($D2,$T1) }); - -&set_label("done"); - &movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value - &movd (&DWP(-16*3+4*1,"edi"),$D1); - &movd (&DWP(-16*3+4*2,"edi"),$D2); - &movd (&DWP(-16*3+4*3,"edi"),$D3); - &movd (&DWP(-16*3+4*4,"edi"),$D4); - &mov ("esp","ebp"); -&set_label("nodata"); -&function_end("_poly1305_blocks_sse2"); - -&align (32); -&function_begin("_poly1305_emit_sse2"); - &mov ("ebp",&wparam(0)); # context - - &cmp (&DWP(4*5,"ebp"),0); # is_base2_26? 
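When is_base2_26 is set, the emit path that follows first converts the base 2^26 hash back to base 2^32 (the shl/shr/adc chain), the inverse of the split done at `enter_sse2`. Both directions, sketched with plain integer arithmetic in Rust (illustrative helpers, not code from this file):

    const MASK26: u32 = (1 << 26) - 1;

    // base 2^32 (four words plus a small fifth word) -> five 26-bit limbs
    fn to_base2_26(h: [u32; 5]) -> [u32; 5] {
        let lo = (h[0] as u128)
            | (h[1] as u128) << 32
            | (h[2] as u128) << 64
            | (h[3] as u128) << 96;
        [
            (lo as u32) & MASK26,
            ((lo >> 26) as u32) & MASK26,
            ((lo >> 52) as u32) & MASK26,
            ((lo >> 78) as u32) & MASK26,
            ((lo >> 104) as u32) | (h[4] << 24),
        ]
    }

    // five 26-bit limbs -> base 2^32; the shift counts 26/20/14/8 are the
    // same ones the shl/shr pairs in the emit code use
    fn to_base2_32(h: [u32; 5]) -> ([u32; 4], u32) {
        let mut acc = h[0] as u64 + ((h[1] as u64) << 26);
        let w0 = acc as u32;
        acc = (acc >> 32) + ((h[2] as u64) << 20);
        let w1 = acc as u32;
        acc = (acc >> 32) + ((h[3] as u64) << 14);
        let w2 = acc as u32;
        acc = (acc >> 32) + ((h[4] as u64) << 8);
        let w3 = acc as u32;
        ([w0, w1, w2, w3], (acc >> 32) as u32) // top word may still need a mod-p fold
    }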
-	&je	(&label("enter_emit"));
-
-	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
-	&mov	("edi",&DWP(4*1,"ebp"));
-	&mov	("ecx",&DWP(4*2,"ebp"));
-	&mov	("edx",&DWP(4*3,"ebp"));
-	&mov	("esi",&DWP(4*4,"ebp"));
-
-	&mov	("ebx","edi");			# base 2^26 -> base 2^32
-	&shl	("edi",26);
-	&shr	("ebx",6);
-	&add	("eax","edi");
-	&mov	("edi","ecx");
-	&adc	("ebx",0);
-
-	&shl	("edi",20);
-	&shr	("ecx",12);
-	&add	("ebx","edi");
-	&mov	("edi","edx");
-	&adc	("ecx",0);
-
-	&shl	("edi",14);
-	&shr	("edx",18);
-	&add	("ecx","edi");
-	&mov	("edi","esi");
-	&adc	("edx",0);
-
-	&shl	("edi",8);
-	&shr	("esi",24);
-	&add	("edx","edi");
-	&adc	("esi",0);			# can be partially reduced
-
-	&mov	("edi","esi");			# final reduction
-	&and	("esi",3);
-	&shr	("edi",2);
-	&lea	("ebp",&DWP(0,"edi","edi",4));	# *5
-	&mov	("edi",&wparam(1));		# output
-	&add	("eax","ebp");
-	&mov	("ebp",&wparam(2));		# key
-	&adc	("ebx",0);
-	&adc	("ecx",0);
-	&adc	("edx",0);
-	&adc	("esi",0);
-
-	&movd	($D0,"eax");			# offload original hash value
-	&add	("eax",5);			# compare to modulus
-	&movd	($D1,"ebx");
-	&adc	("ebx",0);
-	&movd	($D2,"ecx");
-	&adc	("ecx",0);
-	&movd	($D3,"edx");
-	&adc	("edx",0);
-	&adc	("esi",0);
-	&shr	("esi",2);			# did it carry/borrow?
-
-	&neg	("esi");			# do we choose (hash-modulus) ...
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&movd	("eax",$D0);
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&movd	("ebx",$D1);
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&movd	("ecx",$D2);
-	&mov	(&DWP(4*3,"edi"),"edx");
-	&movd	("edx",$D3);
-
-	&not	("esi");			# ... or original hash value?
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&or	("eax",&DWP(4*0,"edi"));
-	&and	("ecx","esi");
-	&or	("ebx",&DWP(4*1,"edi"));
-	&and	("edx","esi");
-	&or	("ecx",&DWP(4*2,"edi"));
-	&or	("edx",&DWP(4*3,"edi"));
-
-	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
-	&adc	("ebx",&DWP(4*1,"ebp"));
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&adc	("ecx",&DWP(4*2,"ebp"));
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&adc	("edx",&DWP(4*3,"ebp"));
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-&function_end("_poly1305_emit_sse2");
-
-# The AVX2 code was removed.
-
-&set_label("const_sse2",64);
-	&data_word(1<<24,0,	1<<24,0,	1<<24,0,	1<<24,0);
-	&data_word(0,0,	0,0,	0,0,	0,0);
-	&data_word(0x03ffffff,0,0x03ffffff,0,	0x03ffffff,0,	0x03ffffff,0);
-	&data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
-}
-&asciz	("Poly1305 for x86, CRYPTOGAMS by ");
-&align	(4);
-
-&asm_finish();
-
-close STDOUT or die "error closing STDOUT";
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
deleted file mode 100755
index d1b547084a..0000000000
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ /dev/null
@@ -1,2243 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-# -# IALU/gcc-4.8(*) AVX(**) AVX2 -# P4 4.46/+120% - -# Core 2 2.41/+90% - -# Westmere 1.88/+120% - -# Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.14/+175% 1.11 0.65 -# Skylake 1.13/+120% 0.96 0.51 -# Silvermont 2.83/+95% - -# VIA Nano 1.82/+150% - -# Sledgehammer 1.38/+160% - -# Bulldozer 2.30/+130% 0.97 -# -# (*) improvement coefficients relative to clang are more modest and -# are ~50% on most processors, in both cases we are comparing to -# __int128 code; -# (**) SSE2 implementation was attempted, but among non-AVX processors -# it was faster than integer-only code only on older Intel P4 and -# Core processors, 50-30%, less newer processor is, but slower on -# contemporary ones, for example almost 2x slower on Atom, and as -# former are naturally disappearing, SSE2 is deemed unnecessary; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -$avx = 2; - -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; - -my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); -my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); - -sub poly1305_iteration { -# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 -# output: $h0-$h2 *= $r0-$r1 -$code.=<<___; - mulq $h0 # h0*r1 - mov %rax,$d2 - mov $r0,%rax - mov %rdx,$d3 - - mulq $h0 # h0*r0 - mov %rax,$h0 # future $h0 - mov $r0,%rax - mov %rdx,$d1 - - mulq $h1 # h1*r0 - add %rax,$d2 - mov $s1,%rax - adc %rdx,$d3 - - mulq $h1 # h1*s1 - mov $h2,$h1 # borrow $h1 - add %rax,$h0 - adc %rdx,$d1 - - imulq $s1,$h1 # h2*s1 - add $h1,$d2 - mov $d1,$h1 - adc \$0,$d3 - - imulq $r0,$h2 # h2*r0 - add $d2,$h1 - mov \$-4,%rax # mask value - adc $h2,$d3 - - and $d3,%rax # last reduction step - mov $d3,$h2 - shr \$2,$d3 - and \$3,$h2 - add $d3,%rax - add %rax,$h0 - adc \$0,$h1 - adc \$0,$h2 -___ -} - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int64 h[3]; # current hash value base 2^64 -# unsigned __int64 r[2]; # key value base 2^64 - -$code.=<<___; -.text - -.extern GFp_ia32cap_P - -.globl GFp_poly1305_init_asm -.hidden GFp_poly1305_init_asm -.globl GFp_poly1305_blocks -.hidden GFp_poly1305_blocks -.globl GFp_poly1305_emit -.hidden GFp_poly1305_emit - -.type GFp_poly1305_init_asm,\@function,3 -.align 32 -GFp_poly1305_init_asm: - xor %rax,%rax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - - cmp \$0,$inp - je .Lno_key - - lea GFp_poly1305_blocks(%rip),%r10 - lea GFp_poly1305_emit(%rip),%r11 -___ -$code.=<<___ if ($avx); - mov GFp_ia32cap_P+4(%rip),%r9 - lea poly1305_blocks_avx(%rip),%rax - lea poly1305_emit_avx(%rip),%rcx - bt \$`60-32`,%r9 # AVX? - cmovc %rax,%r10 - cmovc %rcx,%r11 -___ -$code.=<<___ if ($avx>1); - lea poly1305_blocks_avx2(%rip),%rax - bt \$`5+32`,%r9 # AVX2? 
- cmovc %rax,%r10 -___ -$code.=<<___; - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - and 8($inp),%rcx - mov %rax,24($ctx) - mov %rcx,32($ctx) -___ -$code.=<<___ if ($flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if ($flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax -.Lno_key: - ret -.size GFp_poly1305_init_asm,.-GFp_poly1305_init_asm - -.type GFp_poly1305_blocks,\@function,4 -.align 32 -GFp_poly1305_blocks: -.Lblocks: - shr \$4,$len - jz .Lno_data # too short - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lblocks_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2 - - mov $s1,$r1 - shr \$2,$s1 - mov $r1,%rax - add $r1,$s1 # s1 = r1 + (r1 >> 2) - jmp .Loop - -.align 32 -.Loop: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 -___ - &poly1305_iteration(); -$code.=<<___; - mov $r1,%rax - dec %r15 # len-=16 - jnz .Loop - - mov $h0,0($ctx) # store hash value - mov $h1,8($ctx) - mov $h2,16($ctx) - - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rsp -.Lno_data: -.Lblocks_epilogue: - ret -.size GFp_poly1305_blocks,.-GFp_poly1305_blocks - -.type GFp_poly1305_emit,\@function,3 -.align 32 -GFp_poly1305_emit: -.Lemit: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - ret -.size GFp_poly1305_emit,.-GFp_poly1305_emit -___ -if ($avx) { - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int32 h[5]; # current hash value base 2^26 -# unsigned __int32 is_base2_26; -# unsigned __int64 r[2]; # key value base 2^64 -# unsigned __int64 pad; -# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; -# -# where r^n are base 2^26 digits of degrees of multiplier key. There are -# 5 digits, but last four are interleaved with multiples of 5, totalling -# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 
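The table described in the comment above is filled by computing r^2, r^3 and r^4 with __poly1305_block (below) and then splitting each power into base 2^26 digits. A hedged Rust model of that multiply-and-reduce on base 2^64 limbs, and of the power ladder, follows; h[2] is assumed to be only a few bits wide, as the partial reduction guarantees, and everything here is illustrative rather than code from this file.

    // h = h * r mod 2^130 - 5, with h as three 64-bit limbs (small h[2]).
    fn mul_mod_p(h: [u64; 3], r: [u64; 2]) -> [u64; 3] {
        // clamping makes r[1] a multiple of 4, so s1 = r[1] * 5/4; using s1
        // for 2^128-weighted partial products folds in 2^130 = 5 (mod p)
        let s1 = r[1] + (r[1] >> 2);

        let t0 = (h[0] as u128) * (r[0] as u128) + (h[1] as u128) * (s1 as u128);
        let t1 = (h[0] as u128) * (r[1] as u128)
            + (h[1] as u128) * (r[0] as u128)
            + (h[2] as u128) * (s1 as u128)
            + (t0 >> 64);
        let t2 = (h[2] as u128) * (r[0] as u128) + (t1 >> 64);

        // last reduction step: keep two bits at weight 2^128, multiply the
        // rest by 5 and fold it back into the low limb (the "and $-4 /
        // shr $2 / add" dance in the scalar code)
        let folded = (t0 as u64 as u128) + (t2 >> 2) * 5;
        let mid = (t1 as u64 as u128) + (folded >> 64);
        [folded as u64, mid as u64, ((t2 as u64) & 3) + (mid >> 64) as u64]
    }

    // the r^1..r^4 ladder that __poly1305_init_avx stores in base 2^26
    fn r_powers(r: [u64; 2]) -> [[u64; 3]; 4] {
        let r1 = [r[0], r[1], 0];
        let r2 = mul_mod_p(r1, r);
        let r3 = mul_mod_p(r2, r);
        let r4 = mul_mod_p(r3, r);
        [r1, r2, r3, r4]
    }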
- -my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = - map("%xmm$_",(0..15)); - -$code.=<<___; -.type __poly1305_block,\@abi-omnipotent -.align 32 -__poly1305_block: -___ - &poly1305_iteration(); -$code.=<<___; - ret -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,\@abi-omnipotent -.align 32 -__poly1305_init_avx: - mov $r0,$h0 - mov $r1,$h1 - xor $h2,$h2 - - lea 48+64($ctx),$ctx # size optimization - - mov $r1,%rax - call __poly1305_block # r^2 - - mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 - mov \$0x3ffffff,%edx - mov $h0,$d1 - and $h0#d,%eax - mov $r0,$d2 - and $r0#d,%edx - mov %eax,`16*0+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*0+4-64`($ctx) - shr \$26,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*1+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*1+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*2+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*2+4-64`($ctx) - shr \$26,$d2 - - mov $h1,%rax - mov $r1,%rdx - shl \$12,%rax - shl \$12,%rdx - or $d1,%rax - or $d2,%rdx - and \$0x3ffffff,%eax - and \$0x3ffffff,%edx - mov %eax,`16*3+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*3+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*4+0-64`($ctx) - mov $h1,$d1 - mov %edx,`16*4+4-64`($ctx) - mov $r1,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - shr \$14,$d1 - shr \$14,$d2 - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*5+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*5+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*6+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*6+4-64`($ctx) - shr \$26,$d2 - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+0-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d2#d,`16*7+4-64`($ctx) - lea ($d2,$d2,4),$d2 # *5 - mov $d1#d,`16*8+0-64`($ctx) - mov $d2#d,`16*8+4-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^3 - - mov \$0x3ffffff,%eax # save r^3 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+12-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+12-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+12-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+12-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+12-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+12-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+12-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^4 - - mov \$0x3ffffff,%eax # save r^4 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+8-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+8-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+8-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+8-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+8-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+8-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+8-64`($ctx) - - lea -48-64($ctx),$ctx # size [de-]optimization - ret -.size 
__poly1305_init_avx,.-__poly1305_init_avx - -.type poly1305_blocks_avx,\@function,4 -.align 32 -poly1305_blocks_avx: - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - and \$-16,$len - jz .Lno_data_avx - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx - - test \$31,$len - jz .Leven_avx - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lblocks_avx_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - - call __poly1305_block - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - sub \$16,%r15 - jz .Lstore_base2_26_avx - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx: - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rsp -.Lno_data_avx: -.Lblocks_avx_epilogue: - ret - -.align 32 -.Lbase2_64_avx: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lbase2_64_avx_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$31,$len - jz .Linit_avx - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - -.Linit_avx: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - movl \$1,20($ctx) # set is_base2_26 - - call 
__poly1305_init_avx - -.Lproceed_avx: - mov %r15,$len - - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx - -.align 32 -.Leven_avx: - vmovd 4*0($ctx),$H0 # load hash value - vmovd 4*1($ctx),$H1 - vmovd 4*2($ctx),$H2 - vmovd 4*3($ctx),$H3 - vmovd 4*4($ctx),$H4 - -.Ldo_avx: -___ -$code.=<<___ if (!$win64); - lea -0x58(%rsp),%r11 - sub \$0x178,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x218,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx_body: -___ -$code.=<<___; - sub \$64,$len - lea -32($inp),%rax - cmovc %rax,$inp - - vmovdqu `16*3`($ctx),$D4 # preload r0^2 - lea `16*3+64`($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - ################################################################ - # load input - vmovdqu 16*2($inp),$T0 - vmovdqu 16*3($inp),$T1 - vmovdqa 64(%rcx),$MASK # .Lmask26 - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - vpsrlq \$40,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - jbe .Lskip_loop_avx - - # expand and copy pre-calculated table to stack - vmovdqu `16*1-64`($ctx),$D1 - vmovdqu `16*2-64`($ctx),$D2 - vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 - vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 - vmovdqa $D3,-0x90(%r11) - vmovdqa $D0,0x00(%rsp) - vpshufd \$0xEE,$D1,$D4 - vmovdqu `16*3-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x80(%r11) - vmovdqa $D1,0x10(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqu `16*4-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x70(%r11) - vmovdqa $D2,0x20(%rsp) - vpshufd \$0xEE,$D0,$D4 - vmovdqu `16*5-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D4,-0x60(%r11) - vmovdqa $D0,0x30(%rsp) - vpshufd \$0xEE,$D1,$D3 - vmovdqu `16*6-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D3,-0x50(%r11) - vmovdqa $D1,0x40(%rsp) - vpshufd \$0xEE,$D2,$D4 - vmovdqu `16*7-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D4,-0x40(%r11) - vmovdqa $D2,0x50(%rsp) - vpshufd \$0xEE,$D0,$D3 - vmovdqu `16*8-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D3,-0x30(%r11) - vmovdqa $D0,0x60(%rsp) - vpshufd \$0xEE,$D1,$D4 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x20(%r11) - vmovdqa $D1,0x70(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x10(%r11) - vmovdqa $D2,0x80(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - # - # Note that we start with inp[2:3]*r^2. This is because it - # doesn't depend on reduction in previous iteration. 
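The regrouping in the comment above is Horner's rule split into two independent streams, which is what lets the vector code start on inp[2:3]*r^2 before the previous iteration's reduction has finished. A quick sanity check of the algebra in Rust, over a small prime so plain u128 arithmetic suffices (toy values; nothing here is from this file):

    // serial: ((((0+m1)r + m2)r + m3)r + m4)r = m1*r^4 + m2*r^3 + m3*r^2 + m4*r
    // split:  even stream (m1*r^2 + m3)*r^2, odd stream (m2*r^2 + m4)*r
    const P: u128 = (1 << 61) - 1;

    fn main() {
        let (r, m): (u128, [u128; 4]) = (0x1234_5678, [1, 2, 3, 4]);
        let serial = m.iter().fold(0u128, |h, &mi| (h + mi) % P * r % P);
        let r2 = r * r % P;
        let even = (m[0] * r2 % P + m[2]) % P * r2 % P;
        let odd = (m[1] * r2 % P + m[3]) % P * r % P;
        assert_eq!(serial, (even + odd) % P);
    }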
- ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # though note that $Tx and $Hx are "reversed" in this section, - # and $D4 is preloaded with r0^2... - - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vmovdqa $H2,0x20(%r11) # offload hash - vpmuludq $T2,$D4,$D2 # d3 = h2*r0 - vmovdqa 0x10(%rsp),$H2 # r1^2 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vmovdqa $H0,0x00(%r11) # - vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 - vmovdqa $H1,0x10(%r11) # - vpmuludq $T3,$H2,$H1 # h3*r1 - vpaddq $H0,$D0,$D0 # d0 += h4*s1 - vpaddq $H1,$D4,$D4 # d4 += h3*r1 - vmovdqa $H3,0x30(%r11) # - vpmuludq $T2,$H2,$H0 # h2*r1 - vpmuludq $T1,$H2,$H1 # h1*r1 - vpaddq $H0,$D3,$D3 # d3 += h2*r1 - vmovdqa 0x30(%rsp),$H3 # r2^2 - vpaddq $H1,$D2,$D2 # d2 += h1*r1 - vmovdqa $H4,0x40(%r11) # - vpmuludq $T0,$H2,$H2 # h0*r1 - vpmuludq $T2,$H3,$H0 # h2*r2 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - - vmovdqa 0x40(%rsp),$H4 # s2^2 - vpaddq $H0,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H3,$H1 # h1*r2 - vpmuludq $T0,$H3,$H3 # h0*r2 - vpaddq $H1,$D3,$D3 # d3 += h1*r2 - vmovdqa 0x50(%rsp),$H2 # r3^2 - vpaddq $H3,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H4,$H0 # h4*s2 - vpmuludq $T3,$H4,$H4 # h3*s2 - vpaddq $H0,$D1,$D1 # d1 += h4*s2 - vmovdqa 0x60(%rsp),$H3 # s3^2 - vpaddq $H4,$D0,$D0 # d0 += h3*s2 - - vmovdqa 0x80(%rsp),$H4 # s4^2 - vpmuludq $T1,$H2,$H1 # h1*r3 - vpmuludq $T0,$H2,$H2 # h0*r3 - vpaddq $H1,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $T4,$H3,$H0 # h4*s3 - vpmuludq $T3,$H3,$H1 # h3*s3 - vpaddq $H0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*0($inp),$H0 # load input - vpaddq $H1,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H3,$H3 # h2*s3 - vpmuludq $T2,$H4,$T2 # h2*s4 - vpaddq $H3,$D0,$D0 # d0 += h2*s3 - - vmovdqu 16*1($inp),$H1 # - vpaddq $T2,$D1,$D1 # d1 += h2*s4 - vpmuludq $T3,$H4,$T3 # h3*s4 - vpmuludq $T4,$H4,$T4 # h4*s4 - vpsrldq \$6,$H0,$H2 # splat input - vpaddq $T3,$D2,$D2 # d2 += h3*s4 - vpaddq $T4,$D3,$D3 # d3 += h4*s4 - vpsrldq \$6,$H1,$H3 # - vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 - vpmuludq $T1,$H4,$T0 # h1*s4 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpaddq $T4,$D4,$D4 # d4 += h0*r4 - vmovdqa -0x90(%r11),$T4 # r0^4 - vpaddq $T0,$D0,$D0 # d0 += h1*s4 - - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - #vpsrlq \$40,$H4,$H4 # 4 - vpsrldq \$`40/8`,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpand 0(%rcx),$H4,$H4 # .Lmask24 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpaddq 0x00(%r11),$H0,$H0 # add hash value - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - lea 16*2($inp),%rax - lea 16*4($inp),$inp - sub \$64,$len - cmovc %rax,$inp - - ################################################################ - # Now we accumulate (inp[0:1]+hash)*r^4 - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $H0,$T4,$T0 
# h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vmovdqa -0x80(%r11),$T2 # r1^4 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T0,$D2,$D2 - vpaddq $T1,$D3,$D3 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 - vpaddq $T4,$D4,$D4 - - vpaddq $T0,$D0,$D0 # d0 += h4*s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vmovdqa -0x60(%r11),$T3 # r2^4 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpmuludq $H1,$T2,$T1 # h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T1,$D2,$D2 # d2 += h1*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - - vmovdqa -0x50(%r11),$T4 # s2^4 - vpmuludq $H2,$T3,$T0 # h2*r2 - vpmuludq $H1,$T3,$T1 # h1*r2 - vpaddq $T0,$D4,$D4 # d4 += h2*r2 - vpaddq $T1,$D3,$D3 # d3 += h1*r2 - vmovdqa -0x40(%r11),$T2 # r3^4 - vpmuludq $H0,$T3,$T3 # h0*r2 - vpmuludq $H4,$T4,$T0 # h4*s2 - vpaddq $T3,$D2,$D2 # d2 += h0*r2 - vpaddq $T0,$D1,$D1 # d1 += h4*s2 - vmovdqa -0x30(%r11),$T3 # s3^4 - vpmuludq $H3,$T4,$T4 # h3*s2 - vpmuludq $H1,$T2,$T1 # h1*r3 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - - vmovdqa -0x10(%r11),$T4 # s4^4 - vpaddq $T1,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T2,$T2 # h0*r3 - vpmuludq $H4,$T3,$T0 # h4*s3 - vpaddq $T2,$D3,$D3 # d3 += h0*r3 - vpaddq $T0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*2($inp),$T0 # load input - vpmuludq $H3,$T3,$T2 # h3*s3 - vpmuludq $H2,$T3,$T3 # h2*s3 - vpaddq $T2,$D1,$D1 # d1 += h3*s3 - vmovdqu 16*3($inp),$T1 # - vpaddq $T3,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H2,$T4,$H2 # h2*s4 - vpmuludq $H3,$T4,$H3 # h3*s4 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $H2,$D1,$D1 # d1 += h2*s4 - vpmuludq $H4,$T4,$H4 # h4*s4 - vpsrldq \$6,$T1,$T3 # - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 - vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 - vpmuludq $H1,$T4,$H0 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - #vpsrlq \$40,$T4,$T4 # 4 - vpsrldq \$`40/8`,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpand 0(%rcx),$T4,$T4 # .Lmask24 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D0 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D0,$H0,$H0 - vpsllq \$2,$D0,$D0 - vpaddq $D0,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - ja .Loop_avx - -.Lskip_loop_avx: - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 - add \$32,$len - jnz .Long_tail_avx - - vpaddq $H2,$T2,$T2 - vpaddq $H0,$T0,$T0 - vpaddq $H1,$T1,$T1 - vpaddq $H3,$T3,$T3 - vpaddq $H4,$T4,$T4 - -.Long_tail_avx: - vmovdqa $H2,0x20(%r11) - vmovdqa $H0,0x00(%r11) - vmovdqa $H1,0x10(%r11) - vmovdqa $H3,0x30(%r11) - vmovdqa $H4,0x40(%r11) - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $T2,$D4,$D2 # d2 = h2*r0 - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vpmuludq $T3,$H2,$H0 # h3*r1 - vpaddq $H0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n - vpmuludq $T2,$H2,$H1 # h2*r1 - vpaddq $H1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n - vpmuludq $T1,$H2,$H0 # h1*r1 - vpaddq $H0,$D2,$D2 # d2 += h1*r1 - vpmuludq $T0,$H2,$H2 # h0*r1 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - vpmuludq $T4,$H3,$H3 # h4*s1 - vpaddq $H3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n - vpmuludq $T2,$H4,$H1 # h2*r2 - vpaddq $H1,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H4,$H0 # h1*r2 - vpaddq $H0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n - vpmuludq $T0,$H4,$H4 # h0*r2 - vpaddq $H4,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H2,$H1 # h4*s2 - vpaddq $H1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n - vpmuludq $T3,$H2,$H2 # h3*s2 - vpaddq $H2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $T1,$H3,$H0 # h1*r3 - vpaddq $H0,$D4,$D4 # d4 += h1*r3 - vpmuludq $T0,$H3,$H3 # h0*r3 - vpaddq $H3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n - vpmuludq $T4,$H4,$H1 # h4*s3 - vpaddq $H1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n - vpmuludq $T3,$H4,$H0 # h3*s3 - vpaddq $H0,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H4,$H4 # h2*s3 - vpaddq $H4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $T0,$H2,$H2 # h0*r4 - vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 - vpmuludq $T4,$H3,$H1 # h4*s4 - vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 - vpmuludq $T3,$H3,$H0 # h3*s4 - vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 - vpmuludq $T2,$H3,$H1 # h2*s4 - vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 - vpmuludq $T1,$H3,$H3 # h1*s4 - vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 - - jz .Lshort_tail_avx - - vmovdqu 16*0($inp),$H0 # load input - vmovdqu 16*1($inp),$H1 - - vpsrldq \$6,$H0,$H2 # splat input - vpsrldq \$6,$H1,$H3 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - vpsrlq \$40,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - 
vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 - vpaddq 0x00(%r11),$H0,$H0 - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - ################################################################ - # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpaddq $T0,$D0,$D0 # d0 += h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T1,$D1,$D1 # d1 += h1*r0 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpaddq $T0,$D2,$D2 # d2 += h2*r0 - vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T1,$D3,$D3 # d3 += h3*r0 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpaddq $T4,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 - vpmuludq $H1,$T2,$T0 # h1*r1 - vpaddq $T0,$D2,$D2 # d2 += h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - vpmuludq $H4,$T3,$T3 # h4*s1 - vpaddq $T3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 - vpmuludq $H2,$T4,$T1 # h2*r2 - vpaddq $T1,$D4,$D4 # d4 += h2*r2 - vpmuludq $H1,$T4,$T0 # h1*r2 - vpaddq $T0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 - vpmuludq $H0,$T4,$T4 # h0*r2 - vpaddq $T4,$D2,$D2 # d2 += h0*r2 - vpmuludq $H4,$T2,$T1 # h4*s2 - vpaddq $T1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 - vpmuludq $H3,$T2,$T2 # h3*s2 - vpaddq $T2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $H1,$T3,$T0 # h1*r3 - vpaddq $T0,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T3,$T3 # h0*r3 - vpaddq $T3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 - vpmuludq $H4,$T4,$T1 # h4*s3 - vpaddq $T1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 - vpmuludq $H3,$T4,$T0 # h3*s3 - vpaddq $T0,$D1,$D1 # d1 += h3*s3 - vpmuludq $H2,$T4,$T4 # h2*s3 - vpaddq $T4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H0,$T2,$T2 # h0*r4 - vpaddq $T2,$D4,$D4 # d4 += h0*r4 - vpmuludq $H4,$T3,$T1 # h4*s4 - vpaddq $T1,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$T3,$T0 # h3*s4 - vpaddq $T0,$D2,$D2 # d2 += h3*s4 - vpmuludq $H2,$T3,$T1 # h2*s4 - vpaddq $T1,$D1,$D1 # d1 += h2*s4 - vpmuludq $H1,$T3,$T3 # h1*s4 - vpaddq $T3,$D0,$D0 # d0 += h1*s4 - -.Lshort_tail_avx: - ################################################################ - # horizontal addition - - vpsrldq \$8,$D4,$T4 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D2,$T2 - vpaddq $T3,$D3,$D3 - vpaddq $T4,$D4,$D4 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$D2,$D2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D4,$H4 - vpand $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$H1 - vpand $MASK,$D1,$D1 - vpaddq $H1,$D2,$D2 # h1 -> h2 - - vpaddq $H4,$D0,$D0 - vpsllq \$2,$H4,$H4 - vpaddq $H4,$D0,$D0 # h4 -> h0 - - vpsrlq \$26,$D2,$H2 - vpand $MASK,$D2,$D2 - vpaddq $H2,$D3,$D3 # h2 -> h3 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vmovd $D0,`4*0-48-64`($ctx) # save partially reduced - 
vmovd $D1,`4*1-48-64`($ctx) - vmovd $D2,`4*2-48-64`($ctx) - vmovd $D3,`4*3-48-64`($ctx) - vmovd $D4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx_epilogue: -___ -$code.=<<___ if (!$win64); - lea 0x58(%r11),%rsp -___ -$code.=<<___; - vzeroupper - ret -.size poly1305_blocks_avx,.-poly1305_blocks_avx - -.type poly1305_emit_avx,\@function,3 -.align 32 -poly1305_emit_avx: - cmpl \$0,20($ctx) # is_base2_26? - je .Lemit - - mov 0($ctx),%eax # load hash value base 2^26 - mov 4($ctx),%ecx - mov 8($ctx),%r8d - mov 12($ctx),%r11d - mov 16($ctx),%r10d - - shl \$26,%rcx # base 2^26 -> base 2^64 - mov %r8,%r9 - shl \$52,%r8 - add %rcx,%rax - shr \$12,%r9 - add %rax,%r8 # h0 - adc \$0,%r9 - - shl \$14,%r11 - mov %r10,%rax - shr \$24,%r10 - add %r11,%r9 - shl \$40,%rax - add %rax,%r9 # h1 - adc \$0,%r10 # h2 - - mov %r10,%rax # could be partially reduced, so reduce - mov %r10,%rcx - and \$3,%r10 - shr \$2,%rax - and \$-4,%rcx - add %rcx,%rax - add %rax,%r8 - adc \$0,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - ret -.size poly1305_emit_avx,.-poly1305_emit_avx -___ - -if ($avx>1) { -my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = - map("%ymm$_",(0..15)); -my $S4=$MASK; - -$code.=<<___; -.type poly1305_blocks_avx2,\@function,4 -.align 32 -poly1305_blocks_avx2: - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx2 - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2: - and \$-16,$len - jz .Lno_data_avx2 - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx2 - - test \$63,$len - jz .Leven_avx2 - - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lblocks_avx2_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - -.Lbase2_26_pre_avx2: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_26_pre_avx2 - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2 # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - test %r15,%r15 - jz .Lstore_base2_26_avx2 - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2 - -.align 32 -.Lstore_base2_64_avx2: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2 - -.align 16 -.Lstore_base2_26_avx2: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx2: - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rsp -.Lno_data_avx2: -.Lblocks_avx2_epilogue: - ret - -.align 32 -.Lbase2_64_avx2: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 -.Lbase2_64_avx2_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$63,$len - jz .Linit_avx2 - -.Lbase2_64_pre_avx2: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_64_pre_avx2 - -.Linit_avx2: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx2: - mov %r15,$len - - mov 0(%rsp),%r15 - mov 8(%rsp),%r14 - mov 16(%rsp),%r13 - mov 24(%rsp),%r12 - mov 32(%rsp),%rbp - mov 40(%rsp),%rbx - lea 48(%rsp),%rax - lea 48(%rsp),%rsp -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 - -.align 32 -.Leven_avx2: - vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 - vmovd 4*1($ctx),%x#$H1 - vmovd 4*2($ctx),%x#$H2 - vmovd 4*3($ctx),%x#$H3 - vmovd 4*4($ctx),%x#$H4 - -.Ldo_avx2: -___ -$code.=<<___ if (!$win64); - lea -8(%rsp),%r11 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x1c8,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa 
%xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx2_body: -___ -$code.=<<___; - lea 48+64($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - # expand and copy pre-calculated table to stack - vmovdqu `16*0-64`($ctx),%x#$T2 - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$T3 - vmovdqu `16*2-64`($ctx),%x#$T4 - vmovdqu `16*3-64`($ctx),%x#$D0 - vmovdqu `16*4-64`($ctx),%x#$D1 - vmovdqu `16*5-64`($ctx),%x#$D2 - vmovdqu `16*6-64`($ctx),%x#$D3 - vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434 - vmovdqu `16*7-64`($ctx),%x#$D4 - vpermq \$0x15,$T3,$T3 - vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444 - vmovdqu `16*8-64`($ctx),%x#$MASK - vpermq \$0x15,$T4,$T4 - vpshufd \$0xc8,$T3,$T3 - vmovdqa $T2,0x00(%rsp) - vpermq \$0x15,$D0,$D0 - vpshufd \$0xc8,$T4,$T4 - vmovdqa $T3,0x20(%rsp) - vpermq \$0x15,$D1,$D1 - vpshufd \$0xc8,$D0,$D0 - vmovdqa $T4,0x40(%rsp) - vpermq \$0x15,$D2,$D2 - vpshufd \$0xc8,$D1,$D1 - vmovdqa $D0,0x60(%rsp) - vpermq \$0x15,$D3,$D3 - vpshufd \$0xc8,$D2,$D2 - vmovdqa $D1,0x80(%rsp) - vpermq \$0x15,$D4,$D4 - vpshufd \$0xc8,$D3,$D3 - vmovdqa $D2,0xa0(%rsp) - vpermq \$0x15,$MASK,$MASK - vpshufd \$0xc8,$D4,$D4 - vmovdqa $D3,0xc0(%rsp) - vpshufd \$0xc8,$MASK,$MASK - vmovdqa $D4,0xe0(%rsp) - vmovdqa $MASK,0x100(%rsp) - vmovdqa 64(%rcx),$MASK # .Lmask26 - - ################################################################ - # load input - vmovdqu 16*0($inp),%x#$T0 - vmovdqu 16*1($inp),%x#$T1 - vinserti128 \$1,16*2($inp),$T0,$T0 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - lea 0x90(%rsp),%rax # size optimization - vpaddq $H2,$T2,$H2 # accumulate input - sub \$64,$len - jz .Ltail_avx2 - jmp .Loop_avx2 - -.align 32 -.Loop_avx2: - ################################################################ - # ((inp[0]*r^4+inp[4])*r^4+inp[8])*r^4 - # ((inp[1]*r^4+inp[5])*r^4+inp[9])*r^3 - # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 - # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 - # \________/\________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqa `32*0`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqa `32*1`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqa `32*3`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 - vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" the first one available, the - # corresponding operations are pulled up, so it's - # - # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 - # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 - # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1
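The d0..d4 formulas in the comment block above are the whole per-lane multiplication. As a scalar C sketch (illustrative names; h[] and r[] are 26-bit limbs, s[i] = 5*r[i] absorbs the 2^130 == 5 wrap, and d[] are 64-bit accumulators wide enough for the five products):

    #include <stdint.h>

    /* Scalar form of the d0..d4 comment above: 5x5 schoolbook multiply of
     * 26-bit limbs mod 2^130-5. Products whose weight would reach 2^130 are
     * wrapped around via s[i] = 5*r[i]. */
    static void poly1305_mul(uint64_t d[5], const uint32_t h[5],
                             const uint32_t r[5], const uint32_t s[5]) {
      d[0] = (uint64_t)h[0] * r[0] + (uint64_t)h[1] * s[4] +
             (uint64_t)h[2] * s[3] + (uint64_t)h[3] * s[2] +
             (uint64_t)h[4] * s[1];
      d[1] = (uint64_t)h[0] * r[1] + (uint64_t)h[1] * r[0] +
             (uint64_t)h[2] * s[4] + (uint64_t)h[3] * s[3] +
             (uint64_t)h[4] * s[2];
      d[2] = (uint64_t)h[0] * r[2] + (uint64_t)h[1] * r[1] +
             (uint64_t)h[2] * r[0] + (uint64_t)h[3] * s[4] +
             (uint64_t)h[4] * s[3];
      d[3] = (uint64_t)h[0] * r[3] + (uint64_t)h[1] * r[2] +
             (uint64_t)h[2] * r[1] + (uint64_t)h[3] * r[0] +
             (uint64_t)h[4] * s[4];
      d[4] = (uint64_t)h[0] * r[4] + (uint64_t)h[1] * r[3] +
             (uint64_t)h[2] * r[2] + (uint64_t)h[3] * r[1] +
             (uint64_t)h[4] * r[0];
    }

The vector code computes exactly these sums, four lanes at a time, merely starting with the h2 terms for better instruction scheduling.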
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - vmovdqa `32*4-0x90`(%rax),$T1 # s2 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vmovdqu 16*0($inp),%x#$T0 # load input - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - vinserti128 \$1,16*2($inp),$T0,$T0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vmovdqu 16*1($inp),%x#$T1 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqa `32*5-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpsrldq \$6,$T1,$T3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - vpunpckhqdq $T1,$T0,$T4 # 4 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # lazy reduction (interleaved with tail of input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$4,$T3,$T2 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpand $MASK,$T2,$T2 # 2 - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$30,$T3,$T3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - sub \$64,$len - jnz .Loop_avx2 - - .byte 0x66,0x90 -.Ltail_avx2: - ################################################################ - # while above multiplications were by r^4 in all lanes, in last - # iteration we multiply least significant lane by r^4 and most - # significant one by r, so copy of above except that references - # to the precomputed table are displaced by 4... 
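Concretely, with four lanes and eight pending blocks m[0..7], the loop plus this tail compute

    sum over j = 0..3 of (m[j]*r^4 + m[4+j]) * r^(4-j)
      =   m[0]*r^8 + m[1]*r^7 + m[2]*r^6 + m[3]*r^5
        + m[4]*r^4 + m[5]*r^3 + m[6]*r^2 + m[7]*r

(all mod 2^130-5), which equals the serial Horner evaluation ((..((m[0]*r + m[1])*r + m[2])*r ..)*r + m[7])*r that block-at-a-time Poly1305 would produce. That is why the horizontal addition that follows the tail recovers the correct hash.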
- - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqu `32*0+4`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqu `32*1+4`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqu `32*3+4`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 - vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1 - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # horizontal addition - - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpsrldq \$8,$H0,$T0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$D1,$T1 - vpermq \$0x2,$H2,$T2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 
- vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx2_epilogue: -___ -$code.=<<___ if (!$win64); - lea 8(%r11),%rsp -___ -$code.=<<___; - vzeroupper - ret -.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 -___ -} -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lfive: -.long 5,0,5,0,5,0,5,0 -___ -} - -$code.=<<___; -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -___ - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - - jmp .Lcommon_seh_tail -.size se_handler,.-se_handler - -.type avx_handler,\@abi-omnipotent -.align 16 -avx_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 208($context),%rax # pull context->R11 - - lea 0x50(%rax),%rsi - lea 0xf8(%rax),%rax - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea
24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size avx_handler,.-avx_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_GFp_poly1305_init_asm - .rva .LSEH_end_GFp_poly1305_init_asm - .rva .LSEH_info_GFp_poly1305_init_asm - - .rva .LSEH_begin_GFp_poly1305_blocks - .rva .LSEH_end_GFp_poly1305_blocks - .rva .LSEH_info_GFp_poly1305_blocks - - .rva .LSEH_begin_GFp_poly1305_emit - .rva .LSEH_end_GFp_poly1305_emit - .rva .LSEH_info_GFp_poly1305_emit -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_poly1305_blocks_avx - .rva .Lbase2_64_avx - .rva .LSEH_info_poly1305_blocks_avx_1 - - .rva .Lbase2_64_avx - .rva .Leven_avx - .rva .LSEH_info_poly1305_blocks_avx_2 - - .rva .Leven_avx - .rva .LSEH_end_poly1305_blocks_avx - .rva .LSEH_info_poly1305_blocks_avx_3 - - .rva .LSEH_begin_poly1305_emit_avx - .rva .LSEH_end_poly1305_emit_avx - .rva .LSEH_info_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_poly1305_blocks_avx2 - .rva .Lbase2_64_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_1 - - .rva .Lbase2_64_avx2 - .rva .Leven_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_2 - - .rva .Leven_avx2 - .rva .LSEH_end_poly1305_blocks_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_3 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_GFp_poly1305_init_asm: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_GFp_poly1305_init_asm,.LSEH_begin_GFp_poly1305_init_asm - -.LSEH_info_GFp_poly1305_blocks: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_body,.Lblocks_epilogue - -.LSEH_info_GFp_poly1305_emit: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_GFp_poly1305_emit,.LSEH_begin_GFp_poly1305_emit -___ -$code.=<<___ if ($avx); -.LSEH_info_poly1305_blocks_avx_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_emit_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); -.LSEH_info_poly1305_blocks_avx2_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] -___ -} - -foreach (split('\n',$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - s/%r([a-z]+)#d/%e$1/g; - s/%r([0-9]+)#d/%r$1d/g; - s/%x#%y/%x/g; - - print $_,"\n"; -} -close STDOUT or die "error closing STDOUT"; diff --git a/crypto/poly1305/internal.h b/crypto/poly1305/internal.h new file mode 100644 index 0000000000..98e7a482d1 --- /dev/null +++ b/crypto/poly1305/internal.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016, Google Inc. 
+ * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_POLY1305_INTERNAL_H +#define OPENSSL_HEADER_POLY1305_INTERNAL_H + +#include +#include + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE) +#define OPENSSL_POLY1305_NEON +#endif + +#endif // OPENSSL_HEADER_POLY1305_INTERNAL_H diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c new file mode 100644 index 0000000000..66620580ae --- /dev/null +++ b/crypto/poly1305/poly1305.c @@ -0,0 +1,301 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation of poly1305 is by Andrew Moon +// (https://github.com/floodyberry/poly1305-donna) and released as public +// domain. + +#include + +#include "internal.h" +#include "../internal.h" + + +#if !defined(BORINGSSL_HAS_UINT128) || !defined(OPENSSL_X86_64) + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +// We can assume little-endian. +static uint32_t U8TO32_LE(const uint8_t *m) { + uint32_t r; + GFp_memcpy(&r, m, sizeof(r)); + return r; +} + +static void U32TO8_LE(uint8_t *m, uint32_t v) { + GFp_memcpy(m, &v, sizeof(v)); +} + +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } + +struct poly1305_state_st { + uint32_t r0, r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, h3, h4; + uint8_t buf[16]; + size_t buf_used; + uint8_t key[16]; +}; + +OPENSSL_STATIC_ASSERT(sizeof(struct poly1305_state_st) <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_st"); + +static inline struct poly1305_state_st *poly1305_aligned_state( + poly1305_state *state) { + dev_assert_secret(((uintptr_t)state & 63) == 0); + return (struct poly1305_state_st *)(((uintptr_t)state + 63) & ~63); +} + +// poly1305_blocks updates |state| given some amount of input data. This +// function may only be called with a |len| that is not a multiple of 16 at the +// end of the data. Otherwise the input must be buffered into 16 byte blocks. 
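For reference, this is the block-to-limb mapping that the update loop below performs: each 16-byte block contributes five base-2^26 limbs, and the pad bit 2^128 lands at bit 24 of the top limb, hence the 1 << 24 term. A self-contained sketch with a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    /* Sketch: split one 16-byte block into 26-bit limbs and add it to h,
     * including the 2^128 pad bit. Mirrors the h0..h4 updates below;
     * assumes a little-endian host, as the file itself does. */
    static void add_block(uint32_t h[5], const uint8_t m[16]) {
      uint32_t t[4];
      memcpy(t, m, 16); /* same idea as U8TO32_LE below */
      h[0] += t[0] & 0x3ffffff;                                 /* bits 0..25 */
      h[1] += (uint32_t)((((uint64_t)t[1] << 32) | t[0]) >> 26) & 0x3ffffff;
      h[2] += (uint32_t)((((uint64_t)t[2] << 32) | t[1]) >> 20) & 0x3ffffff;
      h[3] += (uint32_t)((((uint64_t)t[3] << 32) | t[2]) >> 14) & 0x3ffffff;
      h[4] += (t[3] >> 8) | (1 << 24);                          /* + 2^128 */
    }

For the final, possibly short block, the code below instead appends a single 0x01 byte and zero-pads, so no 2^128 term is added for it.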
+static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; + + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_16bytes: + t0 = U8TO32_LE(in); + t1 = U8TO32_LE(in + 4); + t2 = U8TO32_LE(in + 8); + t3 = U8TO32_LE(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + +poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = (uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) { + goto poly1305_donna_16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!len) { + return; + } + + for (j = 0; j < len; j++) { + mp[j] = in[j]; + } + mp[j++] = 1; + for (; j < 16; j++) { + mp[j] = 0; + } + len = 0; + + t0 = U8TO32_LE(mp + 0); + t1 = U8TO32_LE(mp + 4); + t2 = U8TO32_LE(mp + 8); + t3 = U8TO32_LE(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} + +void GFp_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint32_t t0, t1, t2, t3; + + t0 = U8TO32_LE(key + 0); + t1 = U8TO32_LE(key + 4); + t2 = U8TO32_LE(key + 8); + t3 = U8TO32_LE(key + 12); + + // precompute multipliers + state->r0 = t0 & 0x3ffffff; + t0 >>= 26; + t0 |= t1 << 6; + state->r1 = t0 & 0x3ffff03; + t1 >>= 20; + t1 |= t2 << 12; + state->r2 = t1 & 0x3ffc0ff; + t2 >>= 14; + t2 |= t3 << 18; + state->r3 = t2 & 0x3f03fff; + t3 >>= 8; + state->r4 = t3 & 0x00fffff; + + state->s1 = state->r1 * 5; + state->s2 = state->r2 * 5; + state->s3 = state->r3 * 5; + state->s4 = state->r4 * 5; + + // init state + state->h0 = 0; + state->h1 = 0; + state->h2 = 0; + state->h3 = 0; + state->h4 = 0; + + state->buf_used = 0; + 
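The r0..r4 masks in GFp_poly1305_init above are the standard Poly1305 key clamp, r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, re-expressed per 26-bit limb. A byte-wise sketch of the equivalent clamp, shown only to decode those limb masks (hypothetical helper):

    #include <stdint.h>

    /* Clear the top four bits of bytes 3, 7, 11, 15 and the low two bits of
     * bytes 4, 8, 12 of the first key half; this is the same restriction the
     * 0x3ffff03 / 0x3ffc0ff / 0x3f03fff / 0x00fffff masks apply limb-wise. */
    static void clamp_r(uint8_t r[16]) {
      r[3] &= 15;  r[7] &= 15;  r[11] &= 15;  r[15] &= 15;
      r[4] &= 252; r[8] &= 252; r[12] &= 252;
    }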
GFp_memcpy(state->key, key + 16, sizeof(state->key)); +} + +void GFp_poly1305_update(poly1305_state *statep, const uint8_t *in, + size_t in_len) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + + if (state->buf_used) { + size_t todo = 16 - state->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + state->buf[state->buf_used + i] = in[i]; + } + state->buf_used += todo; + in_len -= todo; + in += todo; + + if (state->buf_used == 16) { + poly1305_update(state, state->buf, 16); + state->buf_used = 0; + } + } + + if (in_len >= 16) { + size_t todo = in_len & ~0xf; + poly1305_update(state, in, todo); + in += todo; + in_len &= 0xf; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + state->buf[i] = in[i]; + } + state->buf_used = in_len; + } +} + +void GFp_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint64_t f0, f1, f2, f3; + uint32_t g0, g1, g2, g3, g4; + uint32_t b, nb; + + if (state->buf_used) { + poly1305_update(state, state->buf, state->buf_used); + } + + b = state->h0 >> 26; + state->h0 = state->h0 & 0x3ffffff; + state->h1 += b; + b = state->h1 >> 26; + state->h1 = state->h1 & 0x3ffffff; + state->h2 += b; + b = state->h2 >> 26; + state->h2 = state->h2 & 0x3ffffff; + state->h3 += b; + b = state->h3 >> 26; + state->h3 = state->h3 & 0x3ffffff; + state->h4 += b; + b = state->h4 >> 26; + state->h4 = state->h4 & 0x3ffffff; + state->h0 += b * 5; + + g0 = state->h0 + 5; + b = g0 >> 26; + g0 &= 0x3ffffff; + g1 = state->h1 + b; + b = g1 >> 26; + g1 &= 0x3ffffff; + g2 = state->h2 + b; + b = g2 >> 26; + g2 &= 0x3ffffff; + g3 = state->h3 + b; + b = g3 >> 26; + g3 &= 0x3ffffff; + g4 = state->h4 + b - (1 << 26); + + b = (g4 >> 31) - 1; + nb = ~b; + state->h0 = (state->h0 & nb) | (g0 & b); + state->h1 = (state->h1 & nb) | (g1 & b); + state->h2 = (state->h2 & nb) | (g2 & b); + state->h3 = (state->h3 & nb) | (g3 & b); + state->h4 = (state->h4 & nb) | (g4 & b); + + f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); + f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)U8TO32_LE(&state->key[4]); + f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)U8TO32_LE(&state->key[8]); + f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)U8TO32_LE(&state->key[12]); + + U32TO8_LE(&mac[0], (uint32_t)f0); + f1 += (f0 >> 32); + U32TO8_LE(&mac[4], (uint32_t)f1); + f2 += (f1 >> 32); + U32TO8_LE(&mac[8], (uint32_t)f2); + f3 += (f2 >> 32); + U32TO8_LE(&mac[12], (uint32_t)f3); +} + +#endif // !BORINGSSL_HAS_UINT128 || !OPENSSL_X86_64 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c new file mode 100644 index 0000000000..3b00a9f2f3 --- /dev/null +++ b/crypto/poly1305/poly1305_arm.c @@ -0,0 +1,307 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +// This implementation was taken from the public domain, neon2 version in +// SUPERCOP by D. J. Bernstein and Peter Schwabe. + +#include + +#include "internal.h" +#include "../internal.h" + + +#if defined(OPENSSL_POLY1305_NEON) + +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wcast-align" + +typedef struct { + uint32_t v[12]; // for alignment; only using 10 +} fe1305x2; + +#define addmulmod GFp_poly1305_neon2_addmulmod +#define blocks GFp_poly1305_neon2_blocks + +extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, + const fe1305x2 *c); + +extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in, + size_t inlen); + +static void freeze(fe1305x2 *r) { + int i; + + uint32_t x0 = r->v[0]; + uint32_t x1 = r->v[2]; + uint32_t x2 = r->v[4]; + uint32_t x3 = r->v[6]; + uint32_t x4 = r->v[8]; + uint32_t y0; + uint32_t y1; + uint32_t y2; + uint32_t y3; + uint32_t y4; + uint32_t swap; + + for (i = 0; i < 3; ++i) { + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + x0 += 5 * (x4 >> 26); + x4 &= 0x3ffffff; + } + + y0 = x0 + 5; + y1 = x1 + (y0 >> 26); + y0 &= 0x3ffffff; + y2 = x2 + (y1 >> 26); + y1 &= 0x3ffffff; + y3 = x3 + (y2 >> 26); + y2 &= 0x3ffffff; + y4 = x4 + (y3 >> 26); + y3 &= 0x3ffffff; + swap = -(y4 >> 26); + y4 &= 0x3ffffff; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + y0 &= swap; + y1 &= swap; + y2 &= swap; + y3 &= swap; + y4 &= swap; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + r->v[0] = y0; + r->v[2] = y1; + r->v[4] = y2; + r->v[6] = y3; + r->v[8] = y4; +} + +static void store32(uint8_t out[4], uint32_t v) { GFp_memcpy(out, &v, 4); } + +// load32 exists to avoid breaking strict aliasing rules in +// fe1305x2_frombytearray. 
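The fixed-size memcpy in load32 (and store32) is the portable idiom for unaligned little-endian access: compilers lower it to a single 32-bit load or store, whereas the pointer cast it replaces is undefined behavior under C strict-aliasing rules and may fault on alignment-strict targets. For comparison, a sketch:

    #include <stdint.h>
    #include <string.h>

    static uint32_t load32_portable(const uint8_t *p) {
      uint32_t v;
      memcpy(&v, p, sizeof(v)); /* compiled down to one 32-bit load */
      return v;
    }
    /* Not this: return *(const uint32_t *)p;  -- undefined behavior via
     * strict aliasing, and p may not be 4-byte aligned. */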
+static uint32_t load32(const uint8_t t[4]) { + uint32_t tmp; + GFp_memcpy(&tmp, t, sizeof(tmp)); + return tmp; +} + +static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) { + uint32_t x0 = x->v[0]; + uint32_t x1 = x->v[2]; + uint32_t x2 = x->v[4]; + uint32_t x3 = x->v[6]; + uint32_t x4 = x->v[8]; + + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + + store32(r, x0 + (x1 << 26)); + store32(r + 4, (x1 >> 6) + (x2 << 20)); + store32(r + 8, (x2 >> 12) + (x3 << 14)); + store32(r + 12, (x3 >> 18) + (x4 << 8)); +} + +static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) { + size_t i; + uint8_t t[17]; + + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + xlen -= i; + x += i; + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[0] = 0x3ffffff & load32(t); + r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[8] = load32(t + 13); + + if (xlen) { + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[1] = 0x3ffffff & load32(t); + r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); + r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); + r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); + r->v[9] = load32(t + 13); + } else { + r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; + } +} + +static const alignas(16) fe1305x2 zero; + +struct poly1305_state_st { + uint8_t data[sizeof(fe1305x2[5]) + 128]; + uint8_t buf[32]; + size_t buf_used; + uint8_t key[16]; +}; + +OPENSSL_STATIC_ASSERT(sizeof(struct poly1305_state_st) <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_st"); + +void GFp_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + + r->v[1] = r->v[0] = 0x3ffffff & load32(key); + r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2); + r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 6) >> 4); + r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6); + r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8); + + for (size_t j = 0; j < 10; j++) { + h->v[j] = 0; // XXX: should fast-forward a bit + } + + addmulmod(precomp, r, r, &zero); // precompute r^2 + addmulmod(precomp + 1, precomp, precomp, &zero); // precompute r^4 + + GFp_memcpy(st->key, key + 16, 16); + st->buf_used = 0; +} + +void GFp_poly1305_update_neon(poly1305_state *state, const uint8_t *in, + size_t in_len) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + + if (st->buf_used) { + size_t todo = 32 - st->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + st->buf[st->buf_used + i] = in[i]; + } + st->buf_used += todo; + in_len -= todo; + in += todo; + + if (st->buf_used == sizeof(st->buf) && in_len) { + addmulmod(h, h, precomp, &zero); + fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); + for (size_t i = 0; i < 10; i++) { + h->v[i] += c->v[i]; + } + st->buf_used = 0; + } + } + + while (in_len > 32) { + size_t tlen = 1048576; + if 
(in_len < tlen) { + tlen = in_len; + } + tlen -= blocks(h, precomp, in, tlen); + in_len -= tlen; + in += tlen; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + st->buf[i] = in[i]; + } + st->buf_used = in_len; + } +} + +void GFp_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) { + struct poly1305_state_st *st = (struct poly1305_state_st *)(state); + fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); + fe1305x2 *const h = r + 1; + fe1305x2 *const c = h + 1; + fe1305x2 *const precomp = c + 1; + + addmulmod(h, h, precomp, &zero); + + if (st->buf_used > 16) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + precomp->v[1] = r->v[1]; + precomp->v[3] = r->v[3]; + precomp->v[5] = r->v[5]; + precomp->v[7] = r->v[7]; + precomp->v[9] = r->v[9]; + addmulmod(h, h, precomp, c); + } else if (st->buf_used > 0) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + r->v[1] = 1; + r->v[3] = 0; + r->v[5] = 0; + r->v[7] = 0; + r->v[9] = 0; + addmulmod(h, h, r, c); + } + + h->v[0] += h->v[1]; + h->v[2] += h->v[3]; + h->v[4] += h->v[5]; + h->v[6] += h->v[7]; + h->v[8] += h->v[9]; + freeze(h); + + fe1305x2_frombytearray(c, st->key, 16); + c->v[8] ^= (1 << 24); + + h->v[0] += c->v[0]; + h->v[2] += c->v[2]; + h->v[4] += c->v[4]; + h->v[6] += c->v[6]; + h->v[8] += c->v[8]; + fe1305x2_tobytearray(mac, h); +} + +#endif // OPENSSL_POLY1305_NEON diff --git a/crypto/poly1305/poly1305_arm_asm.S b/crypto/poly1305/poly1305_arm_asm.S new file mode 100644 index 0000000000..24ae435fdd --- /dev/null +++ b/crypto/poly1305/poly1305_arm_asm.S @@ -0,0 +1,2031 @@ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__arm__) && !defined(OPENSSL_NO_ASM) && !defined(__APPLE__) + +#pragma GCC diagnostic ignored "-Wlanguage-extension-token" + +#if defined(BORINGSSL_PREFIX) +#include +#endif + +# This implementation was taken from the public domain, neon2 version in +# SUPERCOP by D. J. Bernstein and Peter Schwabe. 
+ +# qhasm: int32 input_0 + +# qhasm: int32 input_1 + +# qhasm: int32 input_2 + +# qhasm: int32 input_3 + +# qhasm: stack32 input_4 + +# qhasm: stack32 input_5 + +# qhasm: stack32 input_6 + +# qhasm: stack32 input_7 + +# qhasm: int32 caller_r4 + +# qhasm: int32 caller_r5 + +# qhasm: int32 caller_r6 + +# qhasm: int32 caller_r7 + +# qhasm: int32 caller_r8 + +# qhasm: int32 caller_r9 + +# qhasm: int32 caller_r10 + +# qhasm: int32 caller_r11 + +# qhasm: int32 caller_r12 + +# qhasm: int32 caller_r14 + +# qhasm: reg128 caller_q4 + +# qhasm: reg128 caller_q5 + +# qhasm: reg128 caller_q6 + +# qhasm: reg128 caller_q7 + +# qhasm: startcode +.fpu neon +.text + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 x01 + +# qhasm: reg128 x23 + +# qhasm: reg128 x4 + +# qhasm: reg128 y0 + +# qhasm: reg128 y12 + +# qhasm: reg128 y34 + +# qhasm: reg128 5y12 + +# qhasm: reg128 5y34 + +# qhasm: stack128 y0_stack + +# qhasm: stack128 y12_stack + +# qhasm: stack128 y34_stack + +# qhasm: stack128 5y12_stack + +# qhasm: stack128 5y34_stack + +# qhasm: reg128 z0 + +# qhasm: reg128 z12 + +# qhasm: reg128 z34 + +# qhasm: reg128 5z12 + +# qhasm: reg128 5z34 + +# qhasm: stack128 z0_stack + +# qhasm: stack128 z12_stack + +# qhasm: stack128 z34_stack + +# qhasm: stack128 5z12_stack + +# qhasm: stack128 5z34_stack + +# qhasm: stack128 two24 + +# qhasm: int32 ptr + +# qhasm: reg128 c01 + +# qhasm: reg128 c23 + +# qhasm: reg128 d01 + +# qhasm: reg128 d23 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 t2 + +# qhasm: reg128 t3 + +# qhasm: reg128 t4 + +# qhasm: reg128 mask + +# qhasm: reg128 u0 + +# qhasm: reg128 u1 + +# qhasm: reg128 u2 + +# qhasm: reg128 u3 + +# qhasm: reg128 u4 + +# qhasm: reg128 v01 + +# qhasm: reg128 mid + +# qhasm: reg128 v23 + +# qhasm: reg128 v4 + +# qhasm: int32 len + +# qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks +.align 4 +.global GFp_poly1305_neon2_blocks +.hidden GFp_poly1305_neon2_blocks +.type GFp_poly1305_neon2_blocks STT_FUNC +GFp_poly1305_neon2_blocks: +vpush {q4,q5,q6,q7} +mov r12,sp +sub sp,sp,#192 +bic sp,sp,#31 + +# qhasm: len = input_3 +# asm 1: mov >len=int32#4,len=r3,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[input_1=int32#2,input_1=r1,z12=reg128#5%bot->z12=reg128#5%top},[z12=d8->z12=d9},[z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[mask=reg128#7,#0xffffffff +# asm 2: vmov.i64 >mask=q6,#0xffffffff +vmov.i64 q6,#0xffffffff + +# qhasm: 2x u4 = 0xff +# asm 1: vmov.i64 >u4=reg128#8,#0xff +# asm 2: vmov.i64 >u4=q7,#0xff +vmov.i64 q7,#0xff + +# qhasm: x01 aligned= mem128[input_0];input_0+=16 +# asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[x01=d16->x01=d17},[x23=reg128#10%bot->x23=reg128#10%top},[x23=d18->x23=d19},[input_0=int32#1,input_0=r0,>=6 +# asm 1: vshr.u64 >mask=reg128#7,mask=q6,>= 7 +# asm 1: vshr.u64 >u4=reg128#8,u4=q7,5y12=reg128#12,5y12=q11,5y34=reg128#13,5y34=q12,5y12=reg128#12,<5y12=reg128#12,5y12=q11,<5y12=q11,5y34=reg128#13,<5y34=reg128#13,5y34=q12,<5y34=q12,u4=reg128#8,u4=q7,5z12=reg128#14,5z12=q13,5z34=reg128#15,5z34=q14,5z12=reg128#14,<5z12=reg128#14,5z12=q13,<5z12=q13,5z34=reg128#15,<5z34=reg128#15,5z34=q14,<5z34=q14,ptr=int32#2,ptr=r1,r4=reg128#16,r4=q15,r0=reg128#8,r0=q7,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,ptr=r1,ptr=int32#2,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r1,<5y12_stack=[sp,#64] +add r1,sp,#64 + +# 
qhasm: mem128[ptr] aligned= 5y12 +# asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[ptr=int32#2,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r1,<5y34_stack=[sp,#80] +add r1,sp,#80 + +# qhasm: mem128[ptr] aligned= 5y34 +# asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[ptr=int32#2,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r1,<5z12_stack=[sp,#144] +add r1,sp,#144 + +# qhasm: mem128[ptr] aligned= 5z12 +# asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[ptr=int32#2,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r1,<5z34_stack=[sp,#160] +add r1,sp,#160 + +# qhasm: mem128[ptr] aligned= 5z34 +# asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[? len - 64 +# asm 1: cmp +bls ._below64bytes + +# qhasm: input_2 += 32 +# asm 1: add >input_2=int32#2,input_2=r1,c01=reg128#1%bot->c01=reg128#1%top},[c01=d0->c01=d1},[c23=reg128#2%bot->c23=reg128#2%top},[c23=d2->c23=d3},[ptr=int32#3,ptr=r2,z12=reg128#3%bot->z12=reg128#3%top},[z12=d4->z12=d5},[ptr=int32#3,ptr=r2,z0=reg128#4%bot->z0=reg128#4%top},[z0=d6->z0=d7},[r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,ptr=int32#3,<5z34_stack=stack128#11 +# asm 2: lea >ptr=r2,<5z34_stack=[sp,#160] +add r2,sp,#160 + +# qhasm: 5z34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[5z34=d10->5z34=d11},[r0=reg128#8,r0=q7,r2=reg128#14,r2=q13,d01=reg128#12%bot->d01=reg128#12%top},[d01=d22->d01=d23},[r1=reg128#15,r1=q14,ptr=int32#3,<5z12_stack=stack128#10 +# asm 2: lea >ptr=r2,<5z12_stack=[sp,#144] +add r2,sp,#144 + +# qhasm: 5z12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[5z12=d0->5z12=d1},[d23=reg128#2%bot->d23=reg128#2%top},[d23=d2->d23=d3},[input_2=int32#2,input_2=r1,> 40 +# asm 1: vshr.u64 >v4=reg128#4,v4=q3,> 14; v23[3] = d23[2,3] unsigned>> 14 +# asm 1: vshrn.u64 > 26; v01[3] = d01[2,3] unsigned>> 26 +# asm 1: vshrn.u64 > 20; v23[1] = mid[2,3] unsigned>> 20 +# asm 1: vshrn.u64 ptr=int32#3,ptr=r2,y34=reg128#3%bot->y34=reg128#3%top},[y34=d4->y34=d5},[ptr=int32#3,ptr=r2,y12=reg128#2%bot->y12=reg128#2%top},[y12=d2->y12=d3},[ptr=int32#3,ptr=r2,y0=reg128#1%bot->y0=reg128#1%top},[y0=d0->y0=d1},[ptr=int32#3,<5y34_stack=stack128#6 +# asm 2: lea >ptr=r2,<5y34_stack=[sp,#80] +add r2,sp,#80 + +# qhasm: 5y34 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[5y34=d24->5y34=d25},[ptr=int32#3,<5y12_stack=stack128#5 +# asm 2: lea >ptr=r2,<5y12_stack=[sp,#64] +add r2,sp,#64 + +# qhasm: 5y12 aligned= mem128[ptr] +# asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[5y12=d22->5y12=d23},[ptr=int32#3,ptr=r2,> 26 +# asm 1: vshr.u64 >t1=reg128#4,t1=q3,len=int32#4,len=r3,r0=reg128#6,r0=q5,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#5,r3=q4,x4=reg128#8,x4=q7,r4=reg128#16%bot->r4=reg128#16%top},[r4=d30->r4=d31},[> 26 +# asm 1: vshr.u64 >t2=reg128#9,t2=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t0=reg128#10,t0=q9,r2=reg128#9,r2=q8,x4=reg128#11,x4=q10,x01=reg128#6,x01=q5,r0=reg128#8%bot->r0=reg128#8%top},[r0=d14->r0=d15},[ptr=int32#3,ptr=r2,t0=reg128#10,t0=q9,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,x01=reg128#15,x01=q14,z34=reg128#6%bot->z34=reg128#6%top},[z34=d10->z34=d11},[x23=reg128#10,x23=q9,r3=reg128#5,r3=q4,input_2=int32#2,input_2=r1,> 26 +# asm 1: vshr.u64 >t1=reg128#14,t1=q13,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#14,t4=q13,r3=reg128#5,r3=q4,x4=reg128#11,x4=q10,? len - 64 +# asm 1: cmp +bhi ._mainloop2 + +# qhasm: input_2 -= 32 +# asm 1: sub >input_2=int32#3,input_2=r2,? 
len - 32 +# asm 1: cmp +bls ._end + +# qhasm: mainloop: +._mainloop: + +# qhasm: new r0 + +# qhasm: ptr = &two24 +# asm 1: lea >ptr=int32#2,ptr=r1,r4=reg128#5%bot->r4=reg128#5%top},[r4=d8->r4=d9},[u4=reg128#6%bot->u4=reg128#6%top},[u4=d10->u4=d11},[c01=reg128#8%bot->c01=reg128#8%top},[c01=d14->c01=d15},[c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[r0=reg128#4,r0=q3,r3=reg128#6,r3=q5,r1=reg128#14,r1=q13,r2=reg128#8,r2=q7,> 26 +# asm 1: vshr.u64 >t1=reg128#9,t1=q8,r0=reg128#4,r0=q3,r1=reg128#9,r1=q8,> 26 +# asm 1: vshr.u64 >t4=reg128#10,t4=q9,r3=reg128#6,r3=q5,r4=reg128#5,r4=q4,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#11,r1=q10,> 26 +# asm 1: vshr.u64 >t0=reg128#9,t0=q8,r2=reg128#8,r2=q7,r4=reg128#5,r4=q4,r0=reg128#4,r0=q3,t0=reg128#9,t0=q8,> 26 +# asm 1: vshr.u64 >t3=reg128#14,t3=q13,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#6,r3=q5,> 26 +# asm 1: vshr.u64 >t1=reg128#8,t1=q7,x01=reg128#9,x01=q8,r1=reg128#4,r1=q3,> 26 +# asm 1: vshr.u64 >t4=reg128#8,t4=q7,r3=reg128#6,r3=q5,x4=reg128#11,x4=q10,len=int32#4,len=r3,? len - 32 +# asm 1: cmp +bhi ._mainloop + +# qhasm: end: +._end: + +# qhasm: mem128[input_0] = x01;input_0+=16 +# asm 1: vst1.8 {len=int32#1,len=r0,mask=reg128#1,#0xffffffff +# asm 2: vmov.i64 >mask=q0,#0xffffffff +vmov.i64 q0,#0xffffffff + +# qhasm: y01 aligned= mem128[input_2];input_2+=16 +# asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[y01=d2->y01=d3},[_5y01=reg128#3,_5y01=q2,y23=reg128#4%bot->y23=reg128#4%top},[y23=d6->y23=d7},[_5y23=reg128#9,_5y23=q8,_5y4=reg128#11,_5y4=q10,x01=reg128#12%bot->x01=reg128#12%top},[x01=d22->x01=d23},[_5y01=reg128#3,<_5y01=reg128#3,_5y01=q2,<_5y01=q2,x23=reg128#13%bot->x23=reg128#13%top},[x23=d24->x23=d25},[_5y23=reg128#9,<_5y23=reg128#9,_5y23=q8,<_5y23=q8,_5y4=reg128#11,<_5y4=reg128#11,_5y4=q10,<_5y4=q10,c01=reg128#14%bot->c01=reg128#14%top},[c01=d26->c01=d27},[x01=reg128#12,x01=q11,c23=reg128#14%bot->c23=reg128#14%top},[c23=d26->c23=d27},[x23=reg128#13,x23=q12,>=6 +# asm 1: vshr.u64 >mask=reg128#1,mask=q0,x4=reg128#14,x4=q13,r0=reg128#15,r0=q14,r1=reg128#3,r1=q2,r2=reg128#16,r2=q15,r3=reg128#9,r3=q8,r4=reg128#10,r4=q9,> 26 +# asm 1: vshr.u64 >t1=reg128#2,t1=q1,r0=reg128#4,r0=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#3,t4=q2,r3=reg128#9,r3=q8,r4=reg128#3,r4=q2,> 26 +# asm 1: vshr.u64 >t2=reg128#10,t2=q9,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t0=reg128#11,t0=q10,r2=reg128#10,r2=q9,r4=reg128#3,r4=q2,r0=reg128#4,r0=q3,t0=reg128#11,t0=q10,> 26 +# asm 1: vshr.u64 >t3=reg128#12,t3=q11,r0=reg128#4,r0=q3,x23=reg128#10,x23=q9,r3=reg128#9,r3=q8,> 26 +# asm 1: vshr.u64 >t1=reg128#11,t1=q10,x01=reg128#4,x01=q3,r1=reg128#2,r1=q1,> 26 +# asm 1: vshr.u64 >t4=reg128#11,t4=q10,r3=reg128#1,r3=q0,x4=reg128#3,x4=q2, + +#include "internal.h" +#include "../internal.h" + + +#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64) + +#pragma GCC diagnostic ignored "-Wcast-align" +#pragma GCC diagnostic ignored "-Wsign-conversion" + +#include + +static uint32_t load_u32_le(const uint8_t in[4]) { + uint32_t ret; + GFp_memcpy(&ret, in, 4); + return ret; +} + +static uint64_t load_u64_le(const uint8_t in[8]) { + uint64_t ret; + GFp_memcpy(&ret, in, 8); + return ret; +} + +static void store_u64_le(uint8_t out[8], uint64_t v) { + GFp_memcpy(out, &v, 8); +} + +typedef __m128i xmmi; + +static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = { + (1 << 26) - 1, 0, (1 << 26) - 1, 0}; +static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0}; +static const alignas(16) uint32_t 
poly1305_x64_sse2_1shl128[4] = { + (1 << 24), 0, (1 << 24), 0}; + +static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; } + +static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; } + +static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) { + return (uint128_t)a * b; +} + +static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; } + +static inline uint64_t shr128(uint128_t v, const int shift) { + return (uint64_t)(v >> shift); +} + +static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) { + return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift); +} + +typedef struct poly1305_power_t { + union { + xmmi v; + uint64_t u[2]; + uint32_t d[4]; + } R20, R21, R22, R23, R24, S21, S22, S23, S24; +} poly1305_power; + +typedef struct poly1305_state_internal_t { + poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 + bytes of free storage */ + union { + xmmi H[5]; // 80 bytes + uint64_t HH[10]; + }; + // uint64_t r0,r1,r2; [24 bytes] + // uint64_t pad0,pad1; [16 bytes] + uint64_t started; // 8 bytes + uint64_t leftover; // 8 bytes + uint8_t buffer[64]; // 64 bytes +} poly1305_state_internal; /* 448 bytes total + 63 bytes for + alignment = 511 bytes raw */ + +OPENSSL_STATIC_ASSERT(sizeof(poly1305_state_internal) <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_internal"); + +static inline poly1305_state_internal *poly1305_aligned_state( + poly1305_state *state) { + dev_assert_secret(((uintptr_t)state & 63) == 0); + return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63); +} + +static inline size_t poly1305_min(size_t a, size_t b) { + return (a < b) ? a : b; +} + +void GFp_poly1305_init(poly1305_state *state, const uint8_t key[32]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + poly1305_power *p; + uint64_t r0, r1, r2; + uint64_t t0, t1; + + // clamp key + t0 = load_u64_le(key + 0); + t1 = load_u64_le(key + 8); + r0 = t0 & 0xffc0fffffff; + t0 >>= 44; + t0 |= t1 << 20; + r1 = t0 & 0xfffffc0ffff; + t1 >>= 24; + r2 = t1 & 0x00ffffffc0f; + + // store r in un-used space of st->P[1] + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = (uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + + // store pad + p->R23.d[1] = load_u32_le(key + 16); + p->R23.d[3] = load_u32_le(key + 20); + p->R24.d[1] = load_u32_le(key + 24); + p->R24.d[3] = load_u32_le(key + 28); + + // H = 0 + st->H[0] = _mm_setzero_si128(); + st->H[1] = _mm_setzero_si128(); + st->H[2] = _mm_setzero_si128(); + st->H[3] = _mm_setzero_si128(); + st->H[4] = _mm_setzero_si128(); + + st->started = 0; + st->leftover = 0; +} + +static void poly1305_first_block(poly1305_state_internal *st, + const uint8_t *m) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + xmmi T5, T6; + poly1305_power *p; + uint128_t d[3]; + uint64_t r0, r1, r2; + uint64_t r20, r21, r22, s22; + uint64_t pad0, pad1; + uint64_t c; + uint64_t i; + + // pull out stored info + p = &st->P[1]; + + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + pad0 = ((uint64_t)p->R23.d[3] << 32) | 
(uint64_t)p->R23.d[1]; + pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + + // compute powers r^2,r^4 + r20 = r0; + r21 = r1; + r22 = r2; + for (i = 0; i < 2; i++) { + s22 = r22 * (5 << 2); + + d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22)); + d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21)); + d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20)); + + r20 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + r21 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + r22 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + r20 += c * 5; + c = (r20 >> 44); + r20 = r20 & 0xfffffffffff; + r21 += c; + + p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R21.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R22.v = + _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R23.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))), + _MM_SHUFFLE(1, 0, 1, 0)); + p->S21.v = _mm_mul_epu32(p->R21.v, FIVE); + p->S22.v = _mm_mul_epu32(p->R22.v, FIVE); + p->S23.v = _mm_mul_epu32(p->R23.v, FIVE); + p->S24.v = _mm_mul_epu32(p->R24.v, FIVE); + p--; + } + + // put saved info back + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = (uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + p->R23.d[1] = (uint32_t)(pad0); + p->R23.d[3] = (uint32_t)(pad0 >> 32); + p->R24.d[1] = (uint32_t)(pad1); + p->R24.d[3] = (uint32_t)(pad1 >> 32); + + // H = [Mx,My] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + st->H[0] = _mm_and_si128(MMASK, T5); + st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + st->H[2] = _mm_and_si128(MMASK, T5); + st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); +} + +static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi M0, M1, M2, M3, M4; + xmmi C1, C2; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + while (bytes >= 64) { + // H *= [r^4,r^4] + p = &st->P[0]; + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, 
p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My]*[r^2,r^2] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + p = &st->P[1]; + T5 = _mm_mul_epu32(M0, p->R20.v); + T6 = _mm_mul_epu32(M0, p->R21.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M1, p->S24.v); + T6 = _mm_mul_epu32(M1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M2, p->S23.v); + T6 = _mm_mul_epu32(M2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M3, p->S22.v); + T6 = _mm_mul_epu32(M3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M4, p->S21.v); + T6 = _mm_mul_epu32(M4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M0, p->R22.v); + T6 = _mm_mul_epu32(M0, p->R23.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M1, p->R21.v); + T6 = _mm_mul_epu32(M1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M2, p->R20.v); + T6 = _mm_mul_epu32(M2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M3, p->S24.v); + T6 = _mm_mul_epu32(M3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M4, p->S23.v); + T6 = _mm_mul_epu32(M4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M0, p->R24.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)), + _mm_loadl_epi64((const xmmi *)(m + 48))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)), + _mm_loadl_epi64((const xmmi *)(m + 56))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, 
_mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + // reduce + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + m += 64; + bytes -= 64; + } + + st->H[0] = H0; + st->H[1] = H1; + st->H[2] = H2; + st->H[3] = H3; + st->H[4] = H4; +} + +static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi M0, M1, M2, M3, M4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi C1, C2; + + uint64_t r0, r1, r2; + uint64_t t0, t1, t2, t3, t4; + uint64_t c; + size_t consumed = 0; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + // p = [r^2,r^2] + p = &st->P[1]; + + if (bytes >= 32) { + // H *= [r^2,r^2] + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My] + T5 = 
_mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + // reduce + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = (H*[r^2,r^2] + [Mx,My]) + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + consumed = 32; + } + + // finalize, H *= [r^2,r] + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + + p->R20.d[2] = (uint32_t)(r0)&0x3ffffff; + p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; + p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff; + p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; + p->R24.d[2] = (uint32_t)((r2 >> 16)); + p->S21.d[2] = p->R21.d[2] * 5; + p->S22.d[2] = p->R22.d[2] * 5; + p->S23.d[2] = p->R23.d[2] * 5; + p->S24.d[2] = p->R24.d[2] * 5; + + // H *= [r^2,r] + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = 
_mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = H[0]+H[1] + H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); + H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); + H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); + H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); + H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); + + t0 = _mm_cvtsi128_si32(H0); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = _mm_cvtsi128_si32(H1) + c; + c = (t1 >> 26); + t1 &= 0x3ffffff; + t2 = _mm_cvtsi128_si32(H2) + c; + c = (t2 >> 26); + t2 &= 0x3ffffff; + t3 = _mm_cvtsi128_si32(H3) + c; + c = (t3 >> 26); + t3 &= 0x3ffffff; + t4 = _mm_cvtsi128_si32(H4) + c; + c = (t4 >> 26); + t4 &= 0x3ffffff; + t0 = t0 + (c * 5); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = t1 + c; + + st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff); + st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff); + st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff); + + return consumed; +} + +void GFp_poly1305_update(poly1305_state *state, const uint8_t *m, + size_t bytes) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t want; + + // Work around a C language bug. See https://crbug.com/1019588. 
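+  // (With |bytes| == 0, |m| may not point at anything; passing such a
+  // pointer to |GFp_memcpy| is undefined behavior in C even for a
+  // zero-length copy, so return early.)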
+ if (bytes == 0) { + return; + } + + // need at least 32 initial bytes to start the accelerated branch + if (!st->started) { + if ((st->leftover == 0) && (bytes > 32)) { + poly1305_first_block(st, m); + m += 32; + bytes -= 32; + } else { + want = poly1305_min(32 - st->leftover, bytes); + GFp_memcpy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if ((st->leftover < 32) || (bytes == 0)) { + return; + } + poly1305_first_block(st, st->buffer); + st->leftover = 0; + } + st->started = 1; + } + + // handle leftover + if (st->leftover) { + want = poly1305_min(64 - st->leftover, bytes); + GFp_memcpy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if (st->leftover < 64) { + return; + } + poly1305_blocks(st, st->buffer, 64); + st->leftover = 0; + } + + // process 64 byte blocks + if (bytes >= 64) { + want = (bytes & ~63); + poly1305_blocks(st, m, want); + m += want; + bytes -= want; + } + + if (bytes) { + GFp_memcpy(st->buffer + st->leftover, m, bytes); + st->leftover += bytes; + } +} + +void GFp_poly1305_finish(poly1305_state *state, uint8_t mac[16]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t leftover = st->leftover; + uint8_t *m = st->buffer; + uint128_t d[3]; + uint64_t h0, h1, h2; + uint64_t t0, t1; + uint64_t g0, g1, g2, c, nc; + uint64_t r0, r1, r2, s1, s2; + poly1305_power *p; + + if (st->started) { + size_t consumed = poly1305_combine(st, m, leftover); + leftover -= consumed; + m += consumed; + } + + // st->HH will either be 0 or have the combined result + h0 = st->HH[0]; + h1 = st->HH[1]; + h2 = st->HH[2]; + + p = &st->P[1]; + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + s1 = r1 * (5 << 2); + s2 = r2 * (5 << 2); + + if (leftover < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_atleast16bytes: + t0 = load_u64_le(m + 0); + t1 = load_u64_le(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24) | ((uint64_t)1 << 40); + +poly1305_donna_mul: + d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), + mul64x64_128(h2, s1)); + d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), + mul64x64_128(h2, s2)); + d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), + mul64x64_128(h2, r0)); + h0 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + h1 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + h2 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + h0 += c * 5; + + m += 16; + leftover -= 16; + if (leftover >= 16) { + goto poly1305_donna_atleast16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!leftover) { + goto poly1305_donna_finish; + } + + m[leftover++] = 1; + GFp_memset(m + leftover, 0, 16 - leftover); + leftover = 16; + + t0 = load_u64_le(m + 0); + t1 = load_u64_le(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24); + + goto poly1305_donna_mul; + +poly1305_donna_finish: + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += c; + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + + g0 = h0 + 5; + c = (g0 >> 44); + g0 &= 0xfffffffffff; + g1 = h1 + c; + c = (g1 >> 44); + g1 &= 0xfffffffffff; + g2 = h2 + c - ((uint64_t)1 << 
42); + + c = (g2 >> 63) - 1; + nc = ~c; + h0 = (h0 & nc) | (g0 & c); + h1 = (h1 & nc) | (g1 & c); + h2 = (h2 & nc) | (g2 & c); + + // pad + t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; + t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + h0 += (t0 & 0xfffffffffff); + c = (h0 >> 44); + h0 &= 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += (t0 & 0xfffffffffff) + c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + t1 = (t1 >> 24); + h2 += (t1)+c; + + store_u64_le(mac + 0, ((h0) | (h1 << 44))); + store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24))); +} + +#endif // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64 diff --git a/deny.toml b/deny.toml new file mode 100644 index 0000000000..dcfb03439f --- /dev/null +++ b/deny.toml @@ -0,0 +1,30 @@ +[advisories] +unmaintained = "deny" +yanked = "deny" +notice = "deny" + +[licenses] +allow = [ + "Apache-2.0", + "ISC", + "LicenseRef-ring", + "MIT", +] +confidence-threshold = 1.0 + +[[licenses.clarify]] +name = "ring" +expression = "LicenseRef-ring" +license-files = [ + { path = "LICENSE", hash = 0xbd0eed23 }, +] + +[bans] +# We don't maintain a fixed Cargo.lock so enforcing +# `multiple-versions = "deny"` is impractical. +multiple-versions = "allow" +wildcards = "deny" + +[sources] +unknown-registry = "deny" +unknown-git = "deny" diff --git a/include/GFp/.gitattributes b/include/GFp/.gitattributes deleted file mode 100644 index 15a5c58091..0000000000 --- a/include/GFp/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.h linguist-language=C diff --git a/include/GFp/arm_arch.h b/include/GFp/arm_arch.h index ee5e32c8f3..2e64aa9e5e 100644 --- a/include/GFp/arm_arch.h +++ b/include/GFp/arm_arch.h @@ -110,4 +110,68 @@ // ARMV8_SHA256 indicates support for hardware SHA-256 instructions. #define ARMV8_SHA256 (1 << 4) +#if defined(__ASSEMBLER__) + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wundef" +#endif + +// Support macros for +// - Armv8.3-A Pointer Authentication and +// - Armv8.5-A Branch Target Identification +// features which require emitting a .note.gnu.property section with the +// appropriate architecture-dependent feature bits set. 
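+//
+// A consumer (e.g. the static linker) ANDs the feature bits of the
+// GNU_PROPERTY_AARCH64_FEATURE_1_AND notes across all input objects, so every
+// assembly file must emit this note for BTI/PAC to stay enabled in the output.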
+// Read more: "ELF for the Arm® 64-bit Architecture"
+
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0)  // Has Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET hint #34  // BTI 'c'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0  // No Branch Target Identification
+#define AARCH64_VALID_CALL_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 1) == 1  // Signed with A-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)  // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #25      // PACIASP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #29  // AUTIASP
+#elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
+    (__ARM_FEATURE_PAC_DEFAULT & 2) == 2  // Signed with B-key
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+  (1 << 1)  // Has Pointer Authentication
+#define AARCH64_SIGN_LINK_REGISTER hint #27      // PACIBSP
+#define AARCH64_VALIDATE_LINK_REGISTER hint #31  // AUTIBSP
+#else
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0  // No Pointer Authentication
+#if GNU_PROPERTY_AARCH64_BTI != 0
+#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
+#else
+#define AARCH64_SIGN_LINK_REGISTER
+#endif
+#define AARCH64_VALIDATE_LINK_REGISTER
+#endif
+
+#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
+.pushsection .note.gnu.property, "a";
+.balign 8;
+.long 4;
+.long 0x10;
+.long 0x5;
+.asciz "GNU";
+.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+.long 4;
+.long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
+.long 0;
+.popsection;
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
+#endif /* defined __ASSEMBLER__ */
+
 #endif  // OPENSSL_HEADER_ARM_ARCH_H
diff --git a/include/GFp/check.h b/include/GFp/check.h
index cf44db834d..4bd257ca35 100644
--- a/include/GFp/check.h
+++ b/include/GFp/check.h
@@ -17,12 +17,13 @@
 // |debug_assert_nonsecret| is like |assert| and should be used (only) when the
 // assertion does not have any potential to leak a secret. |NDEBUG| controls this
-// exactly like |assert|. It is emulated for WebAssembly so that <assert.h> is
-// not required for it.
+// exactly like |assert|. It is emulated when there is no assert.h to make
+// cross-building easier.
 //
 // When reviewing uses of |debug_assert_nonsecret|, verify that the check
 // really does not have potential to leak a secret.
-#if !defined(__wasm__)
+
+#if !defined(GFp_NOSTDLIBINC)
 # include <assert.h>
 # define debug_assert_nonsecret(x) assert(x)
 #else
diff --git a/include/GFp/poly1305.h b/include/GFp/poly1305.h
new file mode 100644
index 0000000000..53c4036c86
--- /dev/null
+++ b/include/GFp/poly1305.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef OPENSSL_HEADER_POLY1305_H
+#define OPENSSL_HEADER_POLY1305_H
+
+#include <GFp/base.h>
+
+// Keep in sync with `poly1305_state` in poly1305.rs.
+typedef uint8_t poly1305_state[512];
+
+#endif  // OPENSSL_HEADER_POLY1305_H
diff --git a/mk/appveyor.bat b/mk/appveyor.bat
deleted file mode 100644
index ac7c2b713f..0000000000
--- a/mk/appveyor.bat
+++ /dev/null
@@ -1,60 +0,0 @@
-echo on
-SetLocal EnableDelayedExpansion
-
-set VCVARSALL="C:\Program Files (x86)\Microsoft Visual Studio %TOOLCHAIN_VERSION%\VC\vcvarsall.bat"
-
-if [%Platform%] NEQ [x64] goto win32
-set TARGET_ARCH=x86_64
-goto download
-
-:win32
-echo on
-if [%Platform%] NEQ [Win32] exit 1
-set TARGET_ARCH=i686
-goto download
-
-:download
-REM vcvarsall turns echo off
-echo on
-
-mkdir windows_build_tools
-mkdir windows_build_tools\
-echo Downloading Yasm...
-powershell -Command "(New-Object Net.WebClient).DownloadFile('https://www.tortall.net/projects/yasm/releases/yasm-1.3.0-win64.exe', 'windows_build_tools\yasm.exe')"
-if %ERRORLEVEL% NEQ 0 (
-  echo ...downloading Yasm failed.
-  exit 1
-)
-
-mkdir build
-set RUSTUP_URL=https://win.rustup.rs/%TARGET_ARCH%
-set RUSTUP_EXE=build\rustup-init-%TARGET_ARCH%.exe
-echo Downloading %RUSTUP_URL%...
-powershell -Command "(New-Object Net.WebClient).DownloadFile('%RUSTUP_URL%', '%RUSTUP_EXE%')"
-if %ERRORLEVEL% NEQ 0 (
-  echo ...downloading rustup failed.
-  exit 1
-)
-
-set TARGET=%TARGET_ARCH%-pc-windows-msvc
-%RUSTUP_EXE% -y --default-host %TARGET% --default-toolchain %RUST%
-if %ERRORLEVEL% NEQ 0 exit 1
-
-set PATH=%USERPROFILE%\.cargo\bin;%cd%\windows_build_tools;%PATH%
-
-if [%Configuration%] == [Release] set CARGO_MODE=--release
-
-set
-
-link /?
-cl /?
-rustc --version
-cargo --version
-
-cargo test -vv %CARGO_MODE%
-if %ERRORLEVEL% NEQ 0 exit 1
-
-REM Verify that `cargo build`, independent from `cargo test`, works; i.e.
-REM verify that non-test builds aren't trying to use test-only features.
-cargo build -vv %CARGO_MODE%
-if %ERRORLEVEL% NEQ 0 exit 1
diff --git a/mk/cargo.sh b/mk/cargo.sh
new file mode 100755
index 0000000000..a7b8154baf
--- /dev/null
+++ b/mk/cargo.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+#
+# Copyright 2020 Brian Smith.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+set -eux -o pipefail
+IFS=$'\n\t'
+
+rustflags_self_contained="-Clink-self-contained=yes -Clinker=rust-lld"
+qemu_aarch64="qemu-aarch64 -L /usr/aarch64-linux-gnu"
+qemu_arm="qemu-arm -L /usr/arm-linux-gnueabihf"
+
+# Avoid putting the Android tools in `$PATH` because there are tools in this
+# directory like `clang` that would conflict with the same-named tools that may
+# be needed to compile the build script, or to compile for other targets.
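+# Instead, the specific tools needed for each Android target are referenced
+# below by absolute path via CC_*, AR_*, and CARGO_TARGET_*_LINKER.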
+if [ -n "${ANDROID_SDK_ROOT-}" ]; then + android_tools=$ANDROID_SDK_ROOT/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin +fi + +for arg in $*; do + case $arg in + --target=*) + target=${arg#*=} + ;; + *) + ;; + esac +done + +# See comments in install-build-tools.sh. +llvm_version=10 +if [ -n "${RING_COVERAGE-}" ]; then + llvm_version=11 +fi + +case $target in + aarch64-linux-android) + export CC_aarch64_linux_android=$android_tools/aarch64-linux-android21-clang + export AR_aarch64_linux_android=$android_tools/aarch64-linux-android-ar + export CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=$android_tools/aarch64-linux-android21-clang + ;; + aarch64-unknown-linux-gnu) + export CC_aarch64_unknown_linux_gnu=clang-$llvm_version + export AR_aarch64_unknown_linux_gnu=llvm-ar-$llvm_version + export CFLAGS_aarch64_unknown_linux_gnu="--sysroot=/usr/aarch64-linux-gnu" + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="$qemu_aarch64" + ;; + aarch64-unknown-linux-musl) + export CC_aarch64_unknown_linux_musl=clang-$llvm_version + export AR_aarch64_unknown_linux_musl=llvm-ar-$llvm_version + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_RUSTFLAGS="$rustflags_self_contained" + export CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_RUNNER="$qemu_aarch64" + ;; + arm-unknown-linux-gnueabihf) + export CC_arm_unknown_linux_gnueabihf=arm-linux-gnueabihf-gcc + export AR_arm_unknown_linux_gnueabihf=arm-linux-gnueabihf-gcc-ar + export CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc + export CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER="$qemu_arm" + ;; + armv7-linux-androideabi) + export CC_armv7_linux_androideabi=$android_tools/armv7a-linux-androideabi18-clang + export AR_armv7_linux_androideabi=$android_tools/arm-linux-androideabi-ar + export CARGO_TARGET_ARMV7_LINUX_ANDROIDEABI_LINKER=$android_tools/armv7a-linux-androideabi18-clang + ;; + armv7-unknown-linux-musleabihf) + export CC_armv7_unknown_linux_musleabihf=clang-$llvm_version + export AR_armv7_unknown_linux_musleabihf=llvm-ar-$llvm_version + export CARGO_TARGET_ARMV7_UNKNOWN_LINUX_MUSLEABIHF_RUSTFLAGS="$rustflags_self_contained" + export CARGO_TARGET_ARMV7_UNKNOWN_LINUX_MUSLEABIHF_RUNNER="$qemu_arm" + ;; + i686-unknown-linux-gnu) + export CC_i686_unknown_linux_gnu=clang-$llvm_version + export AR_i686_unknown_linux_gnu=llvm-ar-$llvm_version + export CARGO_TARGET_I686_UNKNOWN_LINUX_GNU_LINKER=clang-$llvm_version + ;; + i686-unknown-linux-musl) + export CC_i686_unknown_linux_musl=clang-$llvm_version + export AR_i686_unknown_linux_musl=llvm-ar-$llvm_version + export CARGO_TARGET_I686_UNKNOWN_LINUX_MUSL_RUSTFLAGS="$rustflags_self_contained" + ;; + x86_64-unknown-linux-musl) + export CC_x86_64_unknown_linux_musl=clang-$llvm_version + export AR_x86_64_unknown_linux_musl=llvm-ar-$llvm_version + # XXX: Work around https://github.com/rust-lang/rust/issues/79555. + if [ -n "${RING_COVERAGE-}" ]; then + export CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_LINKER=clang-$llvm_version + else + export CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_RUSTFLAGS="$rustflags_self_contained" + fi + ;; + wasm32-unknown-unknown) + # The first two are only needed for when the "wasm_c" feature is enabled. + export CC_wasm32_unknown_unknown=clang-$llvm_version + export AR_wasm32_unknown_unknown=llvm-ar-$llvm_version + export CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner + ;; + *) + ;; +esac + +if [ -n "${RING_COVERAGE-}" ]; then + # XXX: Collides between release and debug. 
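+  # (The path below is hard-coded to the `debug` profile directory, so
+  # release-mode coverage output would land in, and overwrite, the same
+  # location as debug-mode output.)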
+  coverage_dir=$PWD/target/$target/debug/coverage
+  mkdir -p "$coverage_dir"
+  rm -f "$coverage_dir"/*.profraw
+
+  export RING_BUILD_EXECUTABLE_LIST="$coverage_dir/executables"
+  truncate --size=0 "$RING_BUILD_EXECUTABLE_LIST"
+
+  # This doesn't work when profiling under QEMU. Instead mk/runner does
+  # something similar but different.
+  # export LLVM_PROFILE_FILE="$coverage_dir/%m.profraw"
+
+  # ${target} with hyphens replaced by underscores, lowercase and uppercase.
+  target_lower=${target//-/_}
+  target_upper=${target_lower^^}
+
+  cflags_var=CFLAGS_${target_lower}
+  declare -x "${cflags_var}=-fprofile-instr-generate -fcoverage-mapping ${!cflags_var-}"
+
+  runner_var=CARGO_TARGET_${target_upper}_RUNNER
+  declare -x "${runner_var}=mk/runner ${!runner_var-}"
+
+  rustflags_var=CARGO_TARGET_${target_upper}_RUSTFLAGS
+  declare -x "${rustflags_var}=-Zinstrument-coverage ${!rustflags_var-}"
+fi
+
+cargo "$@"
+
+if [ -n "${RING_COVERAGE-}" ]; then
+  while read executable; do
+    basename=$(basename "$executable")
+    llvm-profdata-$llvm_version merge -sparse "$coverage_dir/$basename.profraw" -o "$coverage_dir/$basename.profdata"
+    mkdir -p "$coverage_dir"/reports
+    llvm-cov-$llvm_version export \
+      --instr-profile "$coverage_dir/$basename.profdata" \
+      --format lcov \
+      "$executable" \
+      > "$coverage_dir"/reports/coverage-$basename.txt
+  done < "$RING_BUILD_EXECUTABLE_LIST"
+fi
diff --git a/mk/install-build-tools.ps1 b/mk/install-build-tools.ps1
new file mode 100644
index 0000000000..f1d51b981a
--- /dev/null
+++ b/mk/install-build-tools.ps1
@@ -0,0 +1,67 @@
+function Verify-Or-Delete-File {
+  param (
+    [Parameter(Mandatory)]
+    [string]$File,
+    [Parameter(Mandatory)]
+    [string]$ExpectedDigest
+  )
+  $ActualDigest = ( Get-FileHash -Algorithm SHA256 $File ).Hash
+  if ( $ActualDigest -eq $ExpectedDigest )
+  {
+    return
+  }
+  rm $File
+  echo "Digest verification failed for $File; actual $ActualDigest, expected $ExpectedDigest"
+  exit 1
+}
+
+function Download-Zip-and-Extract-File {
+  param (
+    [Parameter(Mandatory)]
+    [string]$Uri,
+    [Parameter(Mandatory)]
+    [string]$ZipExpectedDigest,
+    [Parameter(Mandatory)]
+    [string]$PathWithinZip,
+    [Parameter(Mandatory)]
+    [string]$FileExpectedDigest,
+    [Parameter(Mandatory)]
+    [string]$OutFile
+  )
+  $TmpZip = New-TemporaryFile
+  Invoke-WebRequest -Uri $Uri -OutFile $TmpZip.FullName
+  echo $TmpZip
+  Verify-Or-Delete-File -File $TmpZip.FullName -ExpectedDigest $ZipExpectedDigest
+
+  Add-Type -AssemblyName System.IO.Compression.FileSystem
+  $zip = [System.IO.Compression.ZipFile]::OpenRead($TmpZip)
+  $zip.Entries |
+    Where-Object { $_.FullName -eq $PathWithinZip } |
+    ForEach-Object {
+      $TmpFile = New-TemporaryFile
+      # extract the selected items from the ZIP archive
+      # and copy them to the out folder
+      $FileName = $_.Name
+      [System.IO.Compression.ZipFileExtensions]::ExtractToFile($_, "$TmpFile", $true)
+      Verify-Or-Delete-File -File $TmpFile -ExpectedDigest $FileExpectedDigest
+      Move-Item -Force $TmpFile $OutFile
+    }
+  $zip.Dispose()
+}
+
+$tools_dir = "target/tools"
+mkdir -Force $tools_dir
+
+# This is the file BoringSSL refers to in
+# https://boringssl.googlesource.com/boringssl/+/26f8297177ad8033cc39de84afe9c2000430a66d.
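+# Both the zip and the extracted nasm.exe are pinned to the SHA-256 digests
+# below, so a corrupted or tampered download fails closed rather than being
+# installed.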
+$nasm_version = "nasm-2.13.03" +$nasm_zip = "$nasm_version-win64.zip" +$nasm_zip_sha256 = "B3A1F896B53D07854884C2E0D6BE7DEFBA7EBD09B864BBB9E6D69ADA1C3E989F" +$nasm_exe = "nasm.exe" +$nasm_exe_sha256 = "D8A933BF5CC3597C56193135CB78B225AB225E1F611D2FDB51EF6E3F555B21E3" +Download-Zip-and-Extract-File ` + -Uri "https://www.nasm.us/pub/nasm/releasebuilds/2.13.03/win64/$nasm_zip" ` + -ZipExpectedDigest "$nasm_zip_sha256" ` + -PathWithinZip "$nasm_version/$nasm_exe" ` + -FileExpectedDigest "$nasm_exe_sha256" ` + -OutFile "$tools_dir/$nasm_exe" diff --git a/mk/install-build-tools.sh b/mk/install-build-tools.sh new file mode 100755 index 0000000000..e997bbb40e --- /dev/null +++ b/mk/install-build-tools.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# +# Copyright 2020 Brian Smith. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +set -eux -o pipefail +IFS=$'\n\t' + +target=$1 +features=${2-} + +function install_packages { + sudo apt-get -yq --no-install-suggests --no-install-recommends install "$@" +} + +use_clang= +case $target in +--target*android*) + mkdir -p "${ANDROID_SDK_ROOT}/licenses" + android_license_file="${ANDROID_SDK_ROOT}/licenses/android-sdk-license" + accept_android_license=24333f8a63b6825ea9c5514f83c2829b004d1fee + grep --quiet --no-messages "$accept_android_license" "$android_license_file" \ + || echo $accept_android_license >> "$android_license_file" + sudo "${ANDROID_SDK_ROOT}/tools/bin/sdkmanager" ndk-bundle + ;; +esac + +case $target in +--target=aarch64-unknown-linux-gnu) + # Clang is needed for code coverage. + use_clang=1 + install_packages \ + qemu-user \ + gcc-aarch64-linux-gnu \ + libc6-dev-arm64-cross + ;; +--target=aarch64-unknown-linux-musl|--target=armv7-unknown-linux-musleabihf) + use_clang=1 + install_packages \ + qemu-user + ;; +--target=arm-unknown-linux-gnueabihf) + install_packages \ + qemu-user \ + gcc-arm-linux-gnueabihf \ + libc6-dev-armhf-cross + ;; +--target=i686-unknown-linux-gnu) + use_clang=1 + install_packages \ + gcc-multilib \ + libc6-dev-i386 + ;; +--target=i686-unknown-linux-musl|--target=x86_64-unknown-linux-musl) + use_clang=1 + ;; +--target=wasm32-unknown-unknown) + # The version of wasm-bindgen-cli must match the wasm-bindgen version. 
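+  # (`cargo metadata` reports the resolved dependency graph; jq extracts the
+  # `wasm-bindgen` crate's version so the CLI install below matches it.)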
+ wasm_bindgen_version=$(cargo metadata --format-version 1 | jq -r '.packages | map(select( .name == "wasm-bindgen")) | map(.version) | .[0]') + cargo install wasm-bindgen-cli --vers "$wasm_bindgen_version" --bin wasm-bindgen-test-runner + case ${features-} in + *wasm32_c*) + use_clang=1 + ;; + *) + ;; + esac + ;; +--target=*) + ;; +esac + +if [ -n "$use_clang" ]; then + llvm_version=10 + if [ -n "${RING_COVERAGE-}" ]; then + # https://github.com/rust-lang/rust/pull/79365 upgraded the coverage file + # format to one that only LLVM 11+ can use + llvm_version=11 + sudo apt-key add mk/llvm-snapshot.gpg.key + sudo add-apt-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-$llvm_version main" + sudo apt-get update + fi + install_packages clang-$llvm_version llvm-$llvm_version +fi diff --git a/mk/llvm-snapshot.gpg.key b/mk/llvm-snapshot.gpg.key new file mode 100644 index 0000000000..87a01ff889 --- /dev/null +++ b/mk/llvm-snapshot.gpg.key @@ -0,0 +1,54 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1.4.12 (GNU/Linux) +Comment: See https://apt.llvm.org/. +Comment: Fingerprint: 6084 F3CF 814B 57C1 CF12 EFD5 15CF 4D18 AF4F 7421 + +mQINBFE9lCwBEADi0WUAApM/mgHJRU8lVkkw0CHsZNpqaQDNaHefD6Rw3S4LxNmM +EZaOTkhP200XZM8lVdbfUW9xSjA3oPldc1HG26NjbqqCmWpdo2fb+r7VmU2dq3NM +R18ZlKixiLDE6OUfaXWKamZsXb6ITTYmgTO6orQWYrnW6ckYHSeaAkW0wkDAryl2 +B5v8aoFnQ1rFiVEMo4NGzw4UX+MelF7rxaaregmKVTPiqCOSPJ1McC1dHFN533FY +Wh/RVLKWo6npu+owtwYFQW+zyQhKzSIMvNujFRzhIxzxR9Gn87MoLAyfgKEzrbbT +DhqqNXTxS4UMUKCQaO93TzetX/EBrRpJj+vP640yio80h4Dr5pAd7+LnKwgpTDk1 +G88bBXJAcPZnTSKu9I2c6KY4iRNbvRz4i+ZdwwZtdW4nSdl2792L7Sl7Nc44uLL/ +ZqkKDXEBF6lsX5XpABwyK89S/SbHOytXv9o4puv+65Ac5/UShspQTMSKGZgvDauU +cs8kE1U9dPOqVNCYq9Nfwinkf6RxV1k1+gwtclxQuY7UpKXP0hNAXjAiA5KS5Crq +7aaJg9q2F4bub0mNU6n7UI6vXguF2n4SEtzPRk6RP+4TiT3bZUsmr+1ktogyOJCc +Ha8G5VdL+NBIYQthOcieYCBnTeIH7D3Sp6FYQTYtVbKFzmMK+36ERreL/wARAQAB +tD1TeWx2ZXN0cmUgTGVkcnUgLSBEZWJpYW4gTExWTSBwYWNrYWdlcyA8c3lsdmVz +dHJlQGRlYmlhbi5vcmc+iQI4BBMBAgAiBQJRPZQsAhsDBgsJCAcDAgYVCAIJCgsE +FgIDAQIeAQIXgAAKCRAVz00Yr090Ibx+EADArS/hvkDF8juWMXxh17CgR0WZlHCC +9CTBWkg5a0bNN/3bb97cPQt/vIKWjQtkQpav6/5JTVCSx2riL4FHYhH0iuo4iAPR +udC7Cvg8g7bSPrKO6tenQZNvQm+tUmBHgFiMBJi92AjZ/Qn1Shg7p9ITivFxpLyX +wpmnF1OKyI2Kof2rm4BFwfSWuf8Fvh7kDMRLHv+MlnK/7j/BNpKdozXxLcwoFBmn +l0WjpAH3OFF7Pvm1LJdf1DjWKH0Dc3sc6zxtmBR/KHHg6kK4BGQNnFKujcP7TVdv +gMYv84kun14pnwjZcqOtN3UJtcx22880DOQzinoMs3Q4w4o05oIF+sSgHViFpc3W +R0v+RllnH05vKZo+LDzc83DQVrdwliV12eHxrMQ8UYg88zCbF/cHHnlzZWAJgftg +hB08v1BKPgYRUzwJ6VdVqXYcZWEaUJmQAPuAALyZESw94hSo28FAn0/gzEc5uOYx +K+xG/lFwgAGYNb3uGM5m0P6LVTfdg6vDwwOeTNIExVk3KVFXeSQef2ZMkhwA7wya +KJptkb62wBHFE+o9TUdtMCY6qONxMMdwioRE5BYNwAsS1PnRD2+jtlI0DzvKHt7B +MWd8hnoUKhMeZ9TNmo+8CpsAtXZcBho0zPGz/R8NlJhAWpdAZ1CmcPo83EW86Yq7 +BxQUKnNHcwj2ebkCDQRRPZQsARAA4jxYmbTHwmMjqSizlMJYNuGOpIidEdx9zQ5g +zOr431/VfWq4S+VhMDhs15j9lyml0y4ok215VRFwrAREDg6UPMr7ajLmBQGau0Fc +bvZJ90l4NjXp5p0NEE/qOb9UEHT7EGkEhaZ1ekkWFTWCgsy7rRXfZLxB6sk7pzLC +DshyW3zjIakWAnpQ5j5obiDy708pReAuGB94NSyb1HoW/xGsGgvvCw4r0w3xPStw +F1PhmScE6NTBIfLliea3pl8vhKPlCh54Hk7I8QGjo1ETlRP4Qll1ZxHJ8u25f/ta +RES2Aw8Hi7j0EVcZ6MT9JWTI83yUcnUlZPZS2HyeWcUj+8nUC8W4N8An+aNps9l/ +21inIl2TbGo3Yn1JQLnA1YCoGwC34g8QZTJhElEQBN0X29ayWW6OdFx8MDvllbBV +ymmKq2lK1U55mQTfDli7S3vfGz9Gp/oQwZ8bQpOeUkc5hbZszYwP4RX+68xDPfn+ +M9udl+qW9wu+LyePbW6HX90LmkhNkkY2ZzUPRPDHZANU5btaPXc2H7edX4y4maQa +xenqD0lGh9LGz/mps4HEZtCI5CY8o0uCMF3lT0XfXhuLksr7Pxv57yue8LLTItOJ +d9Hmzp9G97SRYYeqU+8lyNXtU2PdrLLq7QHkzrsloG78lCpQcalHGACJzrlUWVP/ +fN3Ht3kAEQEAAYkCHwQYAQIACQUCUT2ULAIbDAAKCRAVz00Yr090IbhWEADbr50X 
+OEXMIMGRLe+YMjeMX9NG4jxs0jZaWHc/WrGR+CCSUb9r6aPXeLo+45949uEfdSsB +pbaEdNWxF5Vr1CSjuO5siIlgDjmT655voXo67xVpEN4HhMrxugDJfCa6z97P0+ML +PdDxim57uNqkam9XIq9hKQaurxMAECDPmlEXI4QT3eu5qw5/knMzDMZj4Vi6hovL +wvvAeLHO/jsyfIdNmhBGU2RWCEZ9uo/MeerPHtRPfg74g+9PPfP6nyHD2Wes6yGd +oVQwtPNAQD6Cj7EaA2xdZYLJ7/jW6yiPu98FFWP74FN2dlyEA2uVziLsfBrgpS4l +tVOlrO2YzkkqUGrybzbLpj6eeHx+Cd7wcjI8CalsqtL6cG8cUEjtWQUHyTbQWAgG +5VPEgIAVhJ6RTZ26i/G+4J8neKyRs4vz+57UGwY6zI4AB1ZcWGEE3Bf+CDEDgmnP +LSwbnHefK9IljT9XU98PelSryUO/5UPw7leE0akXKB4DtekToO226px1VnGp3Bov +1GBGvpHvL2WizEwdk+nfk8LtrLzej+9FtIcq3uIrYnsac47Pf7p0otcFeTJTjSq3 +krCaoG4Hx0zGQG2ZFpHrSrZTVy6lxvIdfi0beMgY6h78p6M9eYZHQHc02DjFkQXN +bXb5c6gCHESH5PXwPU4jQEE7Ib9J6sbk7ZT2Mw== +=j+4q +-----END PGP PUBLIC KEY BLOCK----- diff --git a/mk/package.sh b/mk/package.sh index 43b9851272..7def4b623e 100644 --- a/mk/package.sh +++ b/mk/package.sh @@ -1,4 +1,3 @@ - # This only works on Windows, using MinGW. set -eux -o pipefail IFS=$'\n\t' diff --git a/mk/runner b/mk/runner new file mode 100755 index 0000000000..ffa1441084 --- /dev/null +++ b/mk/runner @@ -0,0 +1,21 @@ +#!/bin/bash +set -eux -o pipefail +IFS=$'\n\t' + +for arg in $*; do + # There can be some arguments prefixed in front of the executable, e.g. + # when qemu-user is used. There can be arguments after the executable, + # e.g. `cargo test` arguments like `TESTNAME`. + if [[ $arg = */deps/* ]]; then + executable=$arg + break + fi +done + +export LLVM_PROFILE_FILE=$(dirname "$RING_BUILD_EXECUTABLE_LIST")/$(basename "$executable").profraw + +if [ -n "$RING_BUILD_EXECUTABLE_LIST" ]; then + echo "$executable" >> "$RING_BUILD_EXECUTABLE_LIST" +fi + +$* diff --git a/mk/travis-install-kcov.sh b/mk/travis-install-kcov.sh deleted file mode 100755 index af1dcbf5e5..0000000000 --- a/mk/travis-install-kcov.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2016 Pietro Monteiro -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -set -ex - - -# kcov 26 or newer is needed when getting coverage information for Rust. -# kcov 31 is needed so `kcov --version` doesn't exit with status 1. -KCOV_VERSION=${KCOV_VERSION:-36} - -KCOV_INSTALL_PREFIX="${HOME}/kcov-${TARGET_X}" - -# Check if kcov has been cached on travis. -if [[ -f "$KCOV_INSTALL_PREFIX/bin/kcov" ]]; then - KCOV_INSTALLED_VERSION=`$KCOV_INSTALL_PREFIX/bin/kcov --version` - # Exit if we don't need to upgrade kcov. 
- if [[ "$KCOV_INSTALLED_VERSION" == "kcov $KCOV_VERSION" ]]; then - echo "Using cached kcov version: ${KCOV_VERSION}" - exit 0 - else - rm -rf "$KCOV_INSTALL_PREFIX" - fi -fi - -curl -L https://github.com/SimonKagstrom/kcov/archive/v$KCOV_VERSION.tar.gz | tar -zxf - - -pushd kcov-$KCOV_VERSION - -mkdir build - -pushd build - -if [[ "$TARGET_X" == "i686-unknown-linux-gnu" ]]; then - # set PKG_CONFIG_PATH so the kcov build system uses the 32 bit libraries we installed. - # otherwise kcov will be linked with 64 bit libraries and won't work with 32 bit executables. - PKG_CONFIG_PATH="/usr/lib/i386-linux-gnu/pkgconfig" CFLAGS="-m32" \ - CXXFLAGS="-m32" TARGET=$TARGET_X \ - cmake -DCMAKE_INSTALL_PREFIX:PATH="${KCOV_INSTALL_PREFIX}" .. -else - TARGET=$TARGET_X cmake -DCMAKE_INSTALL_PREFIX:PATH="${KCOV_INSTALL_PREFIX}" .. -fi - -make -make install - -$KCOV_INSTALL_PREFIX/bin/kcov --version - -popd -popd diff --git a/mk/travis.sh b/mk/travis.sh deleted file mode 100755 index 6938114c46..0000000000 --- a/mk/travis.sh +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright 2015 Brian Smith. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -set -eux -o pipefail -IFS=$'\n\t' - -printenv - -case $TARGET_X in -aarch64-unknown-linux-gnu) - export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu - ;; -arm-unknown-linux-gnueabihf) - export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf - ;; -aarch64-linux-android) - # XXX: Tests are built but not run because we couldn't get the emulator to work; see - # https://github.com/briansmith/ring/issues/838 - export ANDROID_ABI=aarch64 - ;; -armv7-linux-androideabi) - # XXX: Tests are built but not run because we couldn't get the emulator to work; see - # https://github.com/briansmith/ring/issues/838 - # export ANDROID_SYSTEM_IMAGE="system-images;android-18;default;armeabi-v7a" - export ANDROID_ABI=armeabi-v7a - ;; -esac - -if [[ ! -z "${ANDROID_ABI-}" ]]; then - # install the android sdk/ndk - mkdir "$ANDROID_HOME/licenses" || true - echo "24333f8a63b6825ea9c5514f83c2829b004d1fee" > "$ANDROID_HOME/licenses/android-sdk-license" - sdkmanager ndk-bundle - curl -sSf https://build.travis-ci.org/files/rustup-init.sh | sh -s -- --default-toolchain=$RUST_X -y - export PATH=$HOME/.cargo/bin:$ANDROID_HOME/ndk-bundle/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH - rustup default -fi - -if [[ "$TARGET_X" =~ ^(arm|aarch64) && ! "$TARGET_X" =~ android ]]; then - # We need a newer QEMU than Travis has. - # sudo is needed until the PPA and its packages are whitelisted. - # See https://github.com/travis-ci/apt-source-whitelist/issues/271 - sudo add-apt-repository ppa:pietro-monteiro/qemu-backport -y - sudo apt-get update -qq - sudo apt-get install --no-install-recommends binfmt-support qemu-user-binfmt -y -fi - -if [[ ! 
"$TARGET_X" =~ "x86_64-" ]]; then - rustup target add "$TARGET_X" - - # By default cargo/rustc seems to use cc for linking, We installed the - # multilib support that corresponds to $CC_X but unless cc happens to match - # $CC_X, that's not the right version. The symptom is a linker error - # where it fails to find -lgcc_s. - if [[ ! -z "${CC_X-}" ]]; then - mkdir .cargo - echo "[target.$TARGET_X]" > .cargo/config - echo "linker= \"$CC_X\"" >> .cargo/config - cat .cargo/config - fi -fi - -if [[ ! -z "${CC_X-}" ]]; then - export CC=$CC_X - $CC --version -else - cc --version -fi - -# KCOV needs a C++ compiler. -if [[ "$KCOV" == "1" ]]; then - if [[ ! -z "${CC_X-}" ]]; then - CXX="${CC_X/clang/clang++}" - CXX="${CC_X/gcc/g++}" - export CXX=$CXX - $CXX --version - else - c++ --version - fi -fi - -cargo version -rustc --version - -if [[ "$MODE_X" == "RELWITHDEBINFO" ]]; then - mode=--release - target_dir=target/$TARGET_X/release -else - target_dir=target/$TARGET_X/debug -fi - -if [[ -z "${ANDROID_ABI-}" ]]; then - cargo test -vv -j2 ${mode-} ${FEATURES_X-} --target=$TARGET_X -else - cargo test -vv -j2 --no-run ${mode-} ${FEATURES_X-} --target=$TARGET_X - - if [[ ! -z "${ANDROID_SYSTEM_IMAGE-}" ]]; then - # Building the AVD is slow. Do it here, after we build the code so that any - # build breakage is reported sooner, instead of being delayed by this. - sdkmanager tools - echo no | avdmanager create avd --force --name $ANDROID_ABI -k $ANDROID_SYSTEM_IMAGE --abi $ANDROID_ABI - avdmanager list avd - - $ANDROID_HOME/emulator/emulator @$ANDROID_ABI -memory 2048 -no-skin -no-boot-anim -no-window & - adb wait-for-device - - # Run the unit tests first. The file named ring- in $target_dir is - # the test executable. - - find $target_dir -maxdepth 1 -name ring-* ! -name "*.*" \ - -exec adb push {} /data/ring-test \; - adb shell "cd /data && ./ring-test" 2>&1 | tee /tmp/ring-test-log - grep "test result: ok" /tmp/ring-test-log - - for test_exe in `find $target_dir -maxdepth 1 -name "*test*" -type f ! -name "*.*" `; do - adb push $test_exe /data/`basename $test_exe` - adb shell "cd /data && ./`basename $test_exe`" 2>&1 | \ - tee /tmp/`basename $test_exe`-log - grep "test result: ok" /tmp/`basename $test_exe`-log - done - - adb emu kill - fi -fi - -if [[ "$KCOV" == "1" ]]; then - # kcov reports coverage as a percentage of code *linked into the executable* - # (more accurately, code that has debug info linked into the executable), not - # as a percentage of source code. Thus, any code that gets discarded by the - # linker due to lack of usage isn't counted at all. Thus, we have to re-link - # with "-C link-dead-code" to get accurate code coverage reports. - # Alternatively, we could link pass "-C link-dead-code" in the "cargo test" - # step above, but then "cargo test" we wouldn't be testing the configuration - # we expect people to use in production. - # - # panic=abort is used to get accurate coverage. See - # https://github.com/rust-lang/rust/issues/43410 and - # https://github.com/mozilla/grcov/issues/427#issuecomment-623995594 and - # https://github.com/rust-lang/rust/issues/55352. 
- cargo clean - CARGO_INCREMENTAL=0 \ - RUSTDOCFLAGS="-Cpanic=abort" \ - RUSTFLAGS="-Ccodegen-units=1 -Clink-dead-code -Coverflow-checks=on -Cpanic=abort -Zpanic_abort_tests -Zprofile" \ - cargo test -vv --no-run -j2 ${mode-} ${FEATURES_X-} --target=$TARGET_X - mk/travis-install-kcov.sh - for test_exe in `find target/$TARGET_X/debug -maxdepth 1 -executable -type f`; do - ${HOME}/kcov-${TARGET_X}/bin/kcov \ - --verify \ - --coveralls-id=$TRAVIS_JOB_ID \ - --exclude-path=/usr/include \ - --include-pattern="ring/crypto,ring/src,ring/tests" \ - target/kcov \ - $test_exe - done -fi - -echo end of mk/travis.sh diff --git a/mk/update-travis-yml.py b/mk/update-travis-yml.py deleted file mode 100755 index 4468484111..0000000000 --- a/mk/update-travis-yml.py +++ /dev/null @@ -1,285 +0,0 @@ -# Run this as "python mk/update-travis-yml.py" - -# Copyright 2015 Brian Smith. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND BRIAN SMITH AND THE AUTHORS DISCLAIM -# ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES -# OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL BRIAN SMITH OR THE AUTHORS -# BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY -# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN -# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -import re -import shutil - -rusts = [ - "stable", - "nightly", - "beta", -] - -gcc = "gcc-7" -#Clang 5.0 is the default compiler on Travis CI for Ubuntu 14.04. -clang = "clang" - -linux_compilers = [ - # Assume the default compiler is GCC. - # GCC 4.8 is the default compiler on Travis CI for Ubuntu 14.04. - "", - - clang, - - gcc, -] - -osx_compilers = [ - "", # Don't set CC.' -] - -compilers = { - "aarch64-unknown-linux-gnu" : [ "aarch64-linux-gnu-gcc" ], - "aarch64-linux-android" : [ "aarch64-linux-android21-clang" ], - "armv7-linux-androideabi" : [ "armv7a-linux-androideabi18-clang" ], - "arm-unknown-linux-gnueabihf" : [ "arm-linux-gnueabihf-gcc" ], - "i686-unknown-linux-gnu" : linux_compilers, - "x86_64-unknown-linux-gnu" : linux_compilers, - "x86_64-apple-darwin" : osx_compilers, -} - -feature_sets = [ - "", -] - -modes = [ - "DEBUG", - "RELWITHDEBINFO" -] - -# Mac OS X is first because we don't want to have to wait until all the Linux -# configurations have been built to find out that there is a failure on Mac. -oss = [ - "osx", - "linux", -] - -targets = { - "osx" : [ - "x86_64-apple-darwin", - ], - "linux" : [ - "aarch64-linux-android", - "armv7-linux-androideabi", - "x86_64-unknown-linux-gnu", - "aarch64-unknown-linux-gnu", - "i686-unknown-linux-gnu", - "arm-unknown-linux-gnueabihf", - ], -} - -def format_entries(): - return "\n".join([format_entry(os, target, compiler, rust, mode, features) - for rust in rusts - for os in oss - for target in targets[os] - for compiler in compilers[target] - for mode in modes - for features in feature_sets]) - -# We use alternative names (the "_X" suffix) so that, in mk/travis.sh, we can -# ensure that we set the specific variables we want and that no relevant -# variables are unintentially inherited into the build process. 
Also, we have -# to set |CC_X| instead of |CC| since Travis sets |CC| to its Travis CI default -# value *after* processing the |env:| directive here. -entry_template = """ - - env: TARGET_X=%(target)s %(compilers)s FEATURES_X=%(features)s MODE_X=%(mode)s KCOV=%(kcov)s RUST_X=%(rust)s - rust: %(rust)s - os: %(os)s""" - -entry_indent = " " - -entry_packages_template = """ - addons: - apt: - packages: - %(packages)s""" - -entry_sources_template = """ - sources: - %(sources)s""" - -def format_entry(os, target, compiler, rust, mode, features): - target_words = target.split("-") - arch = target_words[0] - vendor = target_words[1] - sys = target_words[2] - - # Currently kcov only runs on Linux. - # - # GCC 7 was picked arbitrarily to restrict coverage report to one build for - # efficiency reasons. - # - # DEBUG mode is needed because debug symbols are needed for coverage - # tracking. - kcov = (os == "linux" and compiler == gcc and rust == "nightly" and - mode == "DEBUG") - - template = entry_template - - if sys == "darwin": - abi = sys - sys = "macos" - elif sys == "androideabi": - abi = sys - sys = "linux" - template += """ - language: android - android: - components: - - android-18 - - build-tools-26.0.2 - - sys-img-armeabi-v7a-android-18""" - elif sys == "android": - abi = sys - sys = "linux" - template += """ - language: android - android: - components: - - android-21 - - build-tools-26.0.2""" - else: - abi = target_words[3] - - def prefix_all(prefix, xs): - return [prefix + x for x in xs] - - if sys == "linux": - packages = sorted(get_linux_packages_to_install(target, compiler, arch, kcov)) - sources_with_dups = sum([get_sources_for_package(p) for p in packages],[]) - sources = sorted(list(set(sources_with_dups))) - template += """ - dist: trusty""" - - if sys == "linux": - if packages: - template += entry_packages_template - if sources: - template += entry_sources_template - else: - packages = [] - sources = [] - - cc = compiler - - if os == "osx": - os += "\n" + entry_indent + "osx_image: xcode10.1" - - compilers = [] - if cc != "": - compilers += ["CC_X=" + cc] - compilers += "" - - return template % { - "compilers": " ".join(compilers), - "features" : features, - "mode" : mode, - "kcov": "1" if kcov == True else "0", - "packages" : "\n ".join(prefix_all("- ", packages)), - "rust" : rust, - "sources" : "\n ".join(prefix_all("- ", sources)), - "target" : target, - "os" : os, - } - -def get_linux_packages_to_install(target, compiler, arch, kcov): - if compiler.startswith("clang-") or compiler.startswith("gcc-"): - packages = [compiler] - else: - packages = [] - - if kcov: - packages += [replace_cc_with_cxx(compiler)] - - if target == "aarch64-unknown-linux-gnu": - packages += ["gcc-aarch64-linux-gnu", - "libc6-dev-arm64-cross"] - if target == "arm-unknown-linux-gnueabihf": - packages += ["gcc-arm-linux-gnueabihf", - "libc6-dev-armhf-cross"] - - if arch == "i686": - if kcov == True: - packages += [replace_cc_with_cxx(compiler) + "-multilib", - "libcurl3:i386", - "libcurl4-openssl-dev:i386", - "libdw-dev:i386", - "libelf-dev:i386", - "libiberty-dev:i386", - "libkrb5-dev:i386", - "libssl-dev:i386"] - - if compiler.startswith("clang") or compiler == "": - packages += ["libc6-dev-i386", - "gcc-multilib"] - elif compiler.startswith("gcc-"): - packages += [compiler + "-multilib", - "linux-libc-dev:i386"] - else: - raise ValueError("unexpected compiler: %s" % compiler) - elif arch == "x86_64": - if kcov == True: - packages += ["libcurl4-openssl-dev", - "libelf-dev", - "libdw-dev", - 
"binutils-dev", - "libiberty-dev"] - elif arch not in ["aarch64", "arm", "armv7"]: - raise ValueError("unexpected arch: %s" % arch) - - return packages - -def get_sources_for_package(package): - ubuntu_toolchain = "ubuntu-toolchain-r-test" - if package.startswith("clang-"): - _, version = package.split("-") - llvm_toolchain = "llvm-toolchain-trusty-%s" % version - - # Stuff in llvm-toolchain-trusty depends on stuff in the toolchain - # packages. - return [llvm_toolchain, ubuntu_toolchain] - elif package.startswith("gcc-"): - return [ubuntu_toolchain] - else: - return [] - -def replace_cc_with_cxx(compiler): - return compiler \ - .replace("gcc", "g++") \ - .replace("clang", "clang++") - -def main(): - # Make a backup of the file we are about to update. - shutil.copyfile(".travis.yml", ".travis.yml~") - with open(".travis.yml", "r+b") as file: - begin = " # BEGIN GENERATED\n" - end = " # END GENERATED\n" - old_contents = file.read() - new_contents = re.sub("%s(.*?)\n[ ]*%s" % (begin, end), - "".join([begin, format_entries(), "\n\n", end]), - old_contents, flags=re.S) - if old_contents == new_contents: - print "No changes" - return - - file.seek(0) - file.write(new_contents) - file.truncate() - print new_contents - -if __name__ == '__main__': - main() diff --git a/pregenerate_asm/Cargo.toml b/pregenerate_asm/Cargo.toml index a200e0a4c2..8a64273f8a 100644 --- a/pregenerate_asm/Cargo.toml +++ b/pregenerate_asm/Cargo.toml @@ -9,4 +9,4 @@ path = "../build.rs" # Keep this in sync with `[build-dependencies]` in ../Cargo.toml. [dependencies] -cc = "1.0.26" +cc = { version = "1.0.62", default-features = false } diff --git a/src/aead.rs b/src/aead.rs index 40d7f201b6..4d6bdb7904 100644 --- a/src/aead.rs +++ b/src/aead.rs @@ -398,7 +398,7 @@ impl core::fmt::Debug for UnboundKey { } } -#[allow(variant_size_differences)] +#[allow(clippy::large_enum_variant, variant_size_differences)] enum KeyInner { AesGcm(aes_gcm::Key), ChaCha20Poly1305(chacha20_poly1305::Key), @@ -635,7 +635,7 @@ impl Eq for Algorithm {} /// An authentication tag. #[must_use] #[repr(C)] -pub struct Tag(Block); +pub struct Tag([u8; TAG_LEN]); impl AsRef<[u8]> for Tag { fn as_ref(&self) -> &[u8] { diff --git a/src/aead/aes.rs b/src/aead/aes.rs index b029668fb9..28558eb35e 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -12,7 +12,7 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -use super::{counter, iv::Iv, Block, Direction, BLOCK_LEN}; +use super::{counter, iv::Iv, quic::Sample, Block, Direction, BLOCK_LEN}; use crate::{bits::BitLength, c, cpu, endian::*, error, polyfill}; pub(crate) struct Key { @@ -152,7 +152,7 @@ impl Key { set_encrypt_key!(GFp_vpaes_set_encrypt_key, bytes, key_bits, &mut key)? } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::NOHW => { set_encrypt_key!(GFp_aes_nohw_set_encrypt_key, bytes, key_bits, &mut key)? 
} @@ -183,7 +183,7 @@ impl Key { ))] Implementation::VPAES_BSAES => encrypt_block!(GFp_vpaes_encrypt, a, self), - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::NOHW => encrypt_block!(GFp_aes_nohw_encrypt, a, self), } } @@ -280,7 +280,7 @@ impl Key { }); } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::NOHW => ctr32_encrypt_blocks!( GFp_aes_nohw_ctr32_encrypt_blocks, in_out, @@ -291,8 +291,8 @@ impl Key { } } - pub fn new_mask(&self, sample: Block) -> [u8; 5] { - let block = self.encrypt_block(sample); + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { + let block = self.encrypt_block(Block::from(&sample)); let mut out: [u8; 5] = [0; 5]; out.copy_from_slice(&block.as_ref()[..5]); @@ -300,6 +300,10 @@ impl Key { out } + // TODO: use `matches!` when MSRV increases to 1.42.0 and remove this + // `#[allow(...)]` + #[allow(clippy::unknown_clippy_lints)] + #[allow(clippy::match_like_matches_macro)] #[cfg(target_arch = "x86_64")] #[must_use] pub fn is_aes_hw(&self) -> bool { @@ -340,7 +344,7 @@ pub enum Implementation { target_arch = "aarch64", target_arch = "arm", target_arch = "x86_64", - target_arch = "x86" + target_arch = "x86", ))] HWAES = 1, @@ -353,7 +357,7 @@ pub enum Implementation { ))] VPAES_BSAES = 2, - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] NOHW = 3, } @@ -398,15 +402,10 @@ fn detect_implementation(cpu_features: cpu::Features) -> Implementation { Implementation::VPAES_BSAES } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] { Implementation::NOHW } - - #[cfg(target_env = "sgx")] - { - panic!("No AES implementation available!") - } } #[must_use] diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs index 350e397be3..b225e76821 100644 --- a/src/aead/aes_gcm.rs +++ b/src/aead/aes_gcm.rs @@ -190,7 +190,7 @@ fn aead( let bytes = tag_iv.into_bytes_less_safe(); let mut tag = aes_key.encrypt_block(Block::from(&bytes)); tag.bitxor_assign(pre_tag.into()); - Tag(tag) + Tag(*tag.as_ref()) }) } diff --git a/src/aead/block.rs b/src/aead/block.rs index 658ac22d44..157f8ad842 100644 --- a/src/aead/block.rs +++ b/src/aead/block.rs @@ -87,6 +87,7 @@ impl From<&'_ [u8; BLOCK_LEN]> for Block { } impl AsRef<[u8; BLOCK_LEN]> for Block { + #[allow(clippy::transmute_ptr_to_ptr)] #[inline] fn as_ref(&self) -> &[u8; BLOCK_LEN] { unsafe { core::mem::transmute(self) } diff --git a/src/aead/chacha.rs b/src/aead/chacha.rs index cd6eed205f..5e015083b1 100644 --- a/src/aead/chacha.rs +++ b/src/aead/chacha.rs @@ -13,16 +13,16 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-use super::{counter, iv::Iv, Block, BLOCK_LEN}; +use super::{counter, iv::Iv, quic::Sample, BLOCK_LEN}; use crate::{c, endian::*}; -#[repr(C)] -pub struct Key([u8; KEY_LEN]); +#[repr(transparent)] +pub struct Key([LittleEndian; KEY_LEN / 4]); impl From<[u8; KEY_LEN]> for Key { #[inline] fn from(value: [u8; KEY_LEN]) -> Self { - Self(value) + Self(FromByteArray::from_byte_array(&value)) } } @@ -52,9 +52,9 @@ impl Key { } #[inline] - pub fn new_mask(&self, sample: Block) -> [u8; 5] { + pub fn new_mask(&self, sample: Sample) -> [u8; 5] { let mut out: [u8; 5] = [0; 5]; - let iv = Iv::assume_unique_for_key(*sample.as_ref()); + let iv = Iv::assume_unique_for_key(sample); unsafe { self.encrypt( diff --git a/src/aead/chacha20_poly1305.rs b/src/aead/chacha20_poly1305.rs index 26d3315b27..1890648449 100644 --- a/src/aead/chacha20_poly1305.rs +++ b/src/aead/chacha20_poly1305.rs @@ -80,7 +80,7 @@ fn aead( Aad(aad): Aad<&[u8]>, in_out: &mut [u8], direction: Direction, - _todo: cpu::Features, + cpu_features: cpu::Features, ) -> Tag { let chacha20_key = match key { aead::KeyInner::ChaCha20Poly1305(key) => key, @@ -89,7 +89,7 @@ fn aead( let mut counter = Counter::zero(nonce); let mut ctx = { - let key = derive_poly1305_key(chacha20_key, counter.increment()); + let key = derive_poly1305_key(chacha20_key, counter.increment(), cpu_features); poly1305::Context::from_key(key) }; @@ -108,12 +108,12 @@ fn aead( } }; - ctx.update_block( + ctx.update( Block::from_u64_le( LittleEndian::from(polyfill::u64_from_usize(aad.len())), LittleEndian::from(polyfill::u64_from_usize(in_out_len)), - ), - poly1305::Pad::Pad, + ) + .as_ref(), ); ctx.finish() } @@ -123,20 +123,24 @@ fn poly1305_update_padded_16(ctx: &mut poly1305::Context, input: &[u8]) { let remainder_len = input.len() % BLOCK_LEN; let whole_len = input.len() - remainder_len; if whole_len > 0 { - ctx.update_blocks(&input[..whole_len]); + ctx.update(&input[..whole_len]); } if remainder_len > 0 { let mut block = Block::zero(); block.overwrite_part_at(0, &input[whole_len..]); - ctx.update_block(block, poly1305::Pad::Pad) + ctx.update(block.as_ref()) } } // Also used by chacha20_poly1305_openssh. -pub(super) fn derive_poly1305_key(chacha_key: &chacha::Key, iv: Iv) -> poly1305::Key { +pub(super) fn derive_poly1305_key( + chacha_key: &chacha::Key, + iv: Iv, + cpu_features: cpu::Features, +) -> poly1305::Key { let mut key_bytes = [0u8; 2 * BLOCK_LEN]; chacha_key.encrypt_iv_xor_blocks_in_place(iv, &mut key_bytes); - poly1305::Key::from(key_bytes) + poly1305::Key::new(key_bytes, cpu_features) } #[cfg(test)] diff --git a/src/aead/chacha20_poly1305_openssh.rs b/src/aead/chacha20_poly1305_openssh.rs index 656ca3bf4d..cb6f6913e4 100644 --- a/src/aead/chacha20_poly1305_openssh.rs +++ b/src/aead/chacha20_poly1305_openssh.rs @@ -32,7 +32,7 @@ use super::{ chacha::{self, *}, chacha20_poly1305::derive_poly1305_key, - poly1305, Nonce, Tag, + cpu, poly1305, Nonce, Tag, }; use crate::{constant_time, endian::*, error}; use core::convert::TryInto; @@ -46,7 +46,7 @@ impl SealingKey { /// Constructs a new `SealingKey`. 
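`derive_poly1305_key` now also threads `cpu::Features` through to `poly1305::Key::new`, so implementation selection happens once per operation. The derivation itself is unchanged: per RFC 8439 §2.6, the one-time Poly1305 key is the first 32 bytes of ChaCha20 keystream block 0. A rough sketch with a placeholder keystream function (not ring's API):

```rust
const BLOCK_LEN: usize = 16;

// Placeholder: the real code calls `encrypt_iv_xor_blocks_in_place` on a
// zeroed 64-byte buffer to obtain keystream block 0.
fn chacha20_keystream_block0(_key: &[u8; 32], _nonce: &[u8; 12]) -> [u8; 64] {
    [0u8; 64]
}

// RFC 8439 2.6: take the first 2 * BLOCK_LEN = 32 bytes of keystream
// block 0 as the one-time Poly1305 key; the rest is discarded.
fn derive_poly1305_key(key: &[u8; 32], nonce: &[u8; 12]) -> [u8; 2 * BLOCK_LEN] {
    let block = chacha20_keystream_block0(key, nonce);
    let mut key_and_nonce = [0u8; 2 * BLOCK_LEN];
    key_and_nonce.copy_from_slice(&block[..2 * BLOCK_LEN]);
    key_and_nonce
}

fn main() {
    assert_eq!(derive_poly1305_key(&[0; 32], &[0; 12]).len(), 32);
}
```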
pub fn new(key_material: &[u8; KEY_LEN]) -> SealingKey { SealingKey { - key: Key::new(key_material), + key: Key::new(key_material, cpu::features()), } } @@ -64,7 +64,8 @@ impl SealingKey { tag_out: &mut [u8; TAG_LEN], ) { let mut counter = make_counter(sequence_number); - let poly_key = derive_poly1305_key(&self.key.k_2, counter.increment()); + let poly_key = + derive_poly1305_key(&self.key.k_2, counter.increment(), self.key.cpu_features); { let (len_in_out, data_and_padding_in_out) = @@ -92,7 +93,7 @@ impl OpeningKey { /// Constructs a new `OpeningKey`. pub fn new(key_material: &[u8; KEY_LEN]) -> OpeningKey { OpeningKey { - key: Key::new(key_material), + key: Key::new(key_material, cpu::features()), } } @@ -131,7 +132,8 @@ impl OpeningKey { // We must verify the tag before decrypting so that // `ciphertext_in_plaintext_out` is unmodified if verification fails. // This is beyond what we guarantee. - let poly_key = derive_poly1305_key(&self.key.k_2, counter.increment()); + let poly_key = + derive_poly1305_key(&self.key.k_2, counter.increment(), self.key.cpu_features); verify(poly_key, ciphertext_in_plaintext_out, tag)?; let plaintext_in_ciphertext_out = &mut ciphertext_in_plaintext_out[PACKET_LENGTH_LEN..]; @@ -146,10 +148,11 @@ impl OpeningKey { struct Key { k_1: chacha::Key, k_2: chacha::Key, + cpu_features: cpu::Features, } impl Key { - fn new(key_material: &[u8; KEY_LEN]) -> Key { + fn new(key_material: &[u8; KEY_LEN], cpu_features: cpu::Features) -> Key { // The first half becomes K_2 and the second half becomes K_1. let (k_2, k_1) = key_material.split_at(chacha::KEY_LEN); let k_1: [u8; chacha::KEY_LEN] = k_1.try_into().unwrap(); @@ -157,6 +160,7 @@ impl Key { Key { k_1: chacha::Key::from(k_1), k_2: chacha::Key::from(k_2), + cpu_features, } } } diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs index bd777236a5..0e5c668c7e 100644 --- a/src/aead/gcm.rs +++ b/src/aead/gcm.rs @@ -65,7 +65,7 @@ impl Key { } } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::Fallback => { h_table.Htable[0] = gcm_nohw::init(h); } @@ -168,7 +168,7 @@ impl Context { } } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::Fallback => { gcm_nohw::ghash(xi, h_table.Htable[0], input); } @@ -210,7 +210,7 @@ impl Context { } } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Implementation::Fallback => { gcm_nohw::gmult(xi, h_table.Htable[0]); } @@ -228,7 +228,6 @@ impl Context { pub(super) fn is_avx2(&self, cpu_features: cpu::Features) -> bool { match detect_implementation(cpu_features) { Implementation::CLMUL => has_avx_movbe(self.cpu_features), - #[cfg(not(target_env = "sgx"))] _ => false, } } @@ -289,7 +288,7 @@ enum Implementation { #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] NEON, - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] Fallback, } @@ -331,18 +330,13 @@ fn detect_implementation(cpu_features: cpu::Features) -> Implementation { return Implementation::NEON; } - #[cfg(all(not(target_arch = "aarch64"), not(target_env = "sgx")))] + #[cfg(not(target_arch = "aarch64"))] { return Implementation::Fallback; } - - #[cfg(target_env = "sgx")] - { - panic!("No GCM implementation available!") - } } #[cfg(target_arch = "x86_64")] fn has_avx_movbe(cpu_features: cpu::Features) -> bool { - return cpu::intel::AVX.available(cpu_features) && 
cpu::intel::MOVBE.available(cpu_features); + cpu::intel::AVX.available(cpu_features) && cpu::intel::MOVBE.available(cpu_features) } diff --git a/src/aead/poly1305.rs b/src/aead/poly1305.rs index b0df82040d..a87f709a4d 100644 --- a/src/aead/poly1305.rs +++ b/src/aead/poly1305.rs @@ -15,155 +15,105 @@ // TODO: enforce maximum input length. -use super::{ - block::{Block, BLOCK_LEN}, - Tag, -}; -use crate::{bssl, c, error}; -use core::convert::TryInto; +use super::{block::BLOCK_LEN, Tag, TAG_LEN}; +use crate::{c, cpu}; /// A Poly1305 key. -pub struct Key([u8; KEY_LEN]); +pub(super) struct Key { + key_and_nonce: [u8; KEY_LEN], + cpu_features: cpu::Features, +} const KEY_LEN: usize = 2 * BLOCK_LEN; -impl From<[u8; KEY_LEN]> for Key { +impl Key { #[inline] - fn from(value: [u8; KEY_LEN]) -> Self { - Self(value) + pub(super) fn new(key_and_nonce: [u8; KEY_LEN], cpu_features: cpu::Features) -> Self { + Self { + key_and_nonce, + cpu_features, + } } } pub struct Context { - opaque: Opaque, - nonce: Nonce, - func: Funcs, + state: poly1305_state, + #[allow(dead_code)] + cpu_features: cpu::Features, } -/// The memory manipulated by the assembly. -#[repr(C, align(8))] -struct Opaque([u8; OPAQUE_LEN]); -const OPAQUE_LEN: usize = 192; +// Keep in sync with `poly1305_state` in GFp/poly1305.h. +// +// The C code, in particular the way the `poly1305_aligned_state` functions +// are used, is only correct when the state buffer is 64-byte aligned. +#[repr(C, align(64))] +struct poly1305_state([u8; OPAQUE_LEN]); +const OPAQUE_LEN: usize = 512; + +// Abstracts the dispatching logic that chooses the NEON implementation if and +// only if it would work. +macro_rules! dispatch { + ( $features:expr => + ( $f:ident | $neon_f:ident ) + ( $( $p:ident : $t:ty ),+ ) + ( $( $a:expr ),+ ) ) => { + match () { + // Apple's 32-bit ARM ABI is incompatible with the assembly code. + #[cfg(all(target_arch = "arm", not(target_vendor = "apple")))] + () if cpu::arm::NEON.available($features) => { + extern "C" { + fn $neon_f( $( $p : $t ),+ ); + } + unsafe { $neon_f( $( $a ),+ ) } + } + () => { + extern "C" { + fn $f( $( $p : $t ),+ ); + } + unsafe { $f( $( $a ),+ ) } + } + } + } +} impl Context { #[inline] - pub fn from_key(Key(key_and_nonce): Key) -> Self { - extern "C" { - fn GFp_poly1305_blocks( - state: &mut Opaque, - input: *const u8, - len: c::size_t, - should_pad: Pad, - ); - fn GFp_poly1305_emit(state: &mut Opaque, tag: &mut Tag, nonce: &Nonce); - } - - let (key, nonce) = key_and_nonce.split_at(BLOCK_LEN); - let key: [u8; BLOCK_LEN] = key.try_into().unwrap(); - let nonce: [u8; BLOCK_LEN] = nonce.try_into().unwrap(); - - let key = DerivedKey(key); - let nonce = Nonce(nonce); - + pub(super) fn from_key( + Key { + key_and_nonce, + cpu_features, + }: Key, + ) -> Self { let mut ctx = Self { - opaque: Opaque([0u8; OPAQUE_LEN]), - nonce, - func: Funcs { - blocks_fn: GFp_poly1305_blocks, - emit_fn: GFp_poly1305_emit, - }, + state: poly1305_state([0u8; OPAQUE_LEN]), + cpu_features, }; - // On some platforms `init()` doesn't initialize `funcs`. The - // return value of `init()` indicates whether it did or not. Since - // we already gave `func` a default value above, we can ignore the - // return value assuming `init()` doesn't change `func` if it chose - // not to initialize it. Note that this is different than what - // BoringSSL does. 
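The `dispatch!` macro above centralizes the scalar-vs-NEON entry-point choice. A simplified, compilable sketch of the same shape, with safe Rust functions in place of the `extern "C"` symbols and `neon_available` standing in for `cpu::arm::NEON.available(..)`:

```rust
fn neon_available() -> bool {
    // Placeholder for runtime NEON detection; Apple's 32-bit ARM ABI is
    // excluded in the real macro because the assembly is incompatible.
    cfg!(all(target_arch = "arm", not(target_vendor = "apple")))
}

macro_rules! dispatch {
    ( ( $f:ident | $neon_f:ident ) ( $( $a:expr ),+ ) ) => {
        if neon_available() {
            $neon_f( $( $a ),+ )
        } else {
            $f( $( $a ),+ )
        }
    };
}

fn update(state: &mut u64, b: u8) { *state = state.wrapping_add(b as u64); }
fn update_neon(state: &mut u64, b: u8) { *state = state.wrapping_add(b as u64); }

fn main() {
    let mut state = 0u64;
    dispatch!((update | update_neon)(&mut state, 7));
    assert_eq!(state, 7);
}
```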
- let _ = init(&mut ctx.opaque, key, &mut ctx.func); + dispatch!( + cpu_features => + (GFp_poly1305_init | GFp_poly1305_init_neon) + (statep: &mut poly1305_state, key: &[u8; KEY_LEN]) + (&mut ctx.state, &key_and_nonce)); ctx } - pub fn update_block(&mut self, block: Block, pad: Pad) { - self.func.blocks(&mut self.opaque, block.as_ref(), pad); - } - - pub fn update_blocks(&mut self, input: &[u8]) { - debug_assert_eq!(input.len() % BLOCK_LEN, 0); - self.func.blocks(&mut self.opaque, input, Pad::Pad); + #[inline(always)] + pub fn update(&mut self, input: &[u8]) { + dispatch!( + self.cpu_features => + (GFp_poly1305_update | GFp_poly1305_update_neon) + (statep: &mut poly1305_state, input: *const u8, in_len: c::size_t) + (&mut self.state, input.as_ptr(), input.len())); } pub(super) fn finish(mut self) -> Tag { - self.func.emit(&mut self.opaque, &self.nonce) - } -} - -#[cfg(test)] -pub fn check_state_layout() { - let required_state_size = if cfg!(target_arch = "x86") { - // See comment above `_poly1305_init_sse2` in poly1305-x86.pl. - Some(4 * (5 + 1 + 4 + 2 + 4 * 9)) - } else if cfg!(target_arch = "x86_64") { - // See comment above `__poly1305_block` in poly1305-x86_64.pl. - Some(4 * (5 + 1 + 2 * 2 + 2 + 4 * 9)) - } else { - // TODO(davidben): Figure out the layout of the struct. For now, - // `OPAQUE_LEN` is taken from OpenSSL. - None - }; - - if let Some(required_state_size) = required_state_size { - assert!(core::mem::size_of::() >= required_state_size); - } -} - -#[repr(C)] -struct DerivedKey([u8; BLOCK_LEN]); - -/// This is *not* an "AEAD nonce"; it's a Poly1305-specific nonce. -#[repr(C)] -struct Nonce([u8; BLOCK_LEN]); - -#[repr(C)] -struct Funcs { - blocks_fn: - unsafe extern "C" fn(&mut Opaque, input: *const u8, input_len: c::size_t, should_pad: Pad), - emit_fn: unsafe extern "C" fn(&mut Opaque, &mut Tag, nonce: &Nonce), -} - -#[inline] -fn init(state: &mut Opaque, key: DerivedKey, func: &mut Funcs) -> Result<(), error::Unspecified> { - extern "C" { - fn GFp_poly1305_init_asm( - state: &mut Opaque, - key: &DerivedKey, - out_func: &mut Funcs, - ) -> bssl::Result; - } - Result::from(unsafe { GFp_poly1305_init_asm(state, &key, func) }) -} - -#[repr(u32)] -pub enum Pad { - AlreadyPadded = 0, - Pad = 1, -} - -impl Funcs { - #[inline] - fn blocks(&self, state: &mut Opaque, data: &[u8], should_pad: Pad) { - unsafe { - (self.blocks_fn)(state, data.as_ptr(), data.len(), should_pad); - } - } - - #[inline] - fn emit(&self, state: &mut Opaque, nonce: &Nonce) -> Tag { - let mut tag = Tag(Block::zero()); - unsafe { - (self.emit_fn)(state, &mut tag, nonce); - } + let mut tag = Tag([0u8; TAG_LEN]); + dispatch!( + self.cpu_features => + (GFp_poly1305_finish | GFp_poly1305_finish_neon) + (statep: &mut poly1305_state, mac: &mut [u8; TAG_LEN]) + (&mut self.state, &mut tag.0)); tag } } @@ -174,16 +124,7 @@ impl Funcs { /// poly1305 test vectors. 
pub(super) fn sign(key: Key, input: &[u8]) -> Tag { let mut ctx = Context::from_key(key); - let remainder_len = input.len() % BLOCK_LEN; - let full_blocks_len = input.len() - remainder_len; - let (full_blocks, remainder) = input.split_at(full_blocks_len); - ctx.update_blocks(full_blocks); - if remainder_len > 0 { - let mut bytes = [0; BLOCK_LEN]; - bytes[..remainder_len].copy_from_slice(remainder); - bytes[remainder_len] = 1; - ctx.update_block(Block::from(&bytes), Pad::AlreadyPadded); - } + ctx.update(input); ctx.finish() } @@ -193,21 +134,17 @@ mod tests { use crate::test; use core::convert::TryInto; - #[test] - pub fn test_state_layout() { - check_state_layout(); - } - // Adapted from BoringSSL's crypto/poly1305/poly1305_test.cc. #[test] pub fn test_poly1305() { + let cpu_features = cpu::features(); test::run(test_file!("poly1305_test.txt"), |section, test_case| { assert_eq!(section, ""); let key = test_case.consume_bytes("Key"); let key: &[u8; BLOCK_LEN * 2] = key.as_slice().try_into().unwrap(); let input = test_case.consume_bytes("Input"); let expected_mac = test_case.consume_bytes("MAC"); - let key = Key::from(*key); + let key = Key::new(*key, cpu_features); let Tag(actual_mac) = sign(key, &input); assert_eq!(expected_mac, actual_mac.as_ref()); diff --git a/src/aead/quic.rs b/src/aead/quic.rs index 7f0f0a795d..ac667aeda1 100644 --- a/src/aead/quic.rs +++ b/src/aead/quic.rs @@ -17,7 +17,7 @@ //! See draft-ietf-quic-tls. use crate::{ - aead::{aes, block::Block, chacha}, + aead::{aes, chacha}, cpu, error, hkdf, }; use core::convert::{TryFrom, TryInto}; @@ -28,7 +28,7 @@ pub struct HeaderProtectionKey { algorithm: &'static Algorithm, } -#[allow(variant_size_differences)] +#[allow(clippy::large_enum_variant, variant_size_differences)] enum KeyInner { Aes(aes::Key), ChaCha20(chacha::Key), @@ -63,9 +63,8 @@ impl HeaderProtectionKey { /// `sample` must be exactly `self.algorithm().sample_len()` bytes long. pub fn new_mask(&self, sample: &[u8]) -> Result<[u8; 5], error::Unspecified> { let sample = <&[u8; SAMPLE_LEN]>::try_from(sample)?; - let sample = Block::from(sample); - let out = (self.algorithm.new_mask)(&self.inner, sample); + let out = (self.algorithm.new_mask)(&self.inner, *sample); Ok(out) } @@ -78,11 +77,14 @@ impl HeaderProtectionKey { const SAMPLE_LEN: usize = super::TAG_LEN; +/// QUIC sample for new key masks +pub type Sample = [u8; SAMPLE_LEN]; + /// A QUIC Header Protection Algorithm. pub struct Algorithm { init: fn(key: &[u8], cpu_features: cpu::Features) -> Result, - new_mask: fn(key: &KeyInner, sample: Block) -> [u8; 5], + new_mask: fn(key: &KeyInner, sample: Sample) -> [u8; 5], key_len: usize, id: AlgorithmID, @@ -152,7 +154,7 @@ fn aes_init_256(key: &[u8], cpu_features: cpu::Features) -> Result [u8; 5] { +fn aes_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] { let aes_key = match key { KeyInner::Aes(key) => key, _ => unreachable!(), @@ -174,7 +176,7 @@ fn chacha20_init(key: &[u8], _todo: cpu::Features) -> Result [u8; 5] { +fn chacha20_new_mask(key: &KeyInner, sample: Sample) -> [u8; 5] { let chacha20_key = match key { KeyInner::ChaCha20(key) => key, _ => unreachable!(), diff --git a/src/agreement.rs b/src/agreement.rs index 4c1e803430..d116c8faab 100644 --- a/src/agreement.rs +++ b/src/agreement.rs @@ -63,7 +63,6 @@ // Model." 
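The quic.rs hunks above expose the sample as a plain `[u8; 16]` (`Sample`) instead of a `Block`. A usage sketch against the public API shown in this diff (key and sample bytes are illustrative, not test vectors):

```rust
use ring::aead::quic;

fn main() -> Result<(), ring::error::Unspecified> {
    // AES-128 header protection takes a 16-byte key.
    let key = quic::HeaderProtectionKey::new(&quic::AES_128, &[0u8; 16])?;

    // `new_mask` requires exactly `algorithm().sample_len()` bytes
    // (TAG_LEN == 16 for every supported algorithm).
    let sample = [0u8; 16];
    let mask: [u8; 5] = key.new_mask(&sample)?;
    assert_eq!(mask.len(), 5);
    Ok(())
}
```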
use crate::{cpu, debug, ec, error, rand}; -use untrusted; pub use crate::ec::{ curve25519::x25519::X25519, diff --git a/src/arithmetic/bigint.rs b/src/arithmetic/bigint.rs index f563b66c51..620595effd 100644 --- a/src/arithmetic/bigint.rs +++ b/src/arithmetic/bigint.rs @@ -46,7 +46,6 @@ use core::{ marker::PhantomData, ops::{Deref, DerefMut}, }; -use untrusted; pub unsafe trait Prime {} @@ -86,7 +85,7 @@ impl Clone for BoxedLimbs { fn clone(&self) -> Self { Self { limbs: self.limbs.clone(), - m: self.m.clone(), + m: self.m, } } } @@ -137,7 +136,7 @@ impl BoxedLimbs { fn zero(width: Width) -> Self { Self { - limbs: vec![0; width.num_limbs].to_owned().into_boxed_slice(), + limbs: vec![0; width.num_limbs].into_boxed_slice(), m: PhantomData, } } @@ -264,6 +263,7 @@ impl Modulus { // n_mod_r = n % r. As explained in the documentation for `n0`, this is // done by taking the lowest `N0_LIMBS_USED` limbs of `n`. + #[allow(clippy::useless_conversion)] let n0 = { extern "C" { fn GFp_bn_neg_inv_mod_r_u64(n: u64) -> u64; @@ -389,7 +389,7 @@ impl Clone for Elem { fn clone(&self) -> Self { Self { limbs: self.limbs.clone(), - encoding: self.encoding.clone(), + encoding: self.encoding, } } } @@ -1169,7 +1169,7 @@ impl Nonnegative { return Err(error::Unspecified); } } - return Ok(()); + Ok(()) } } @@ -1397,7 +1397,6 @@ mod tests { use super::*; use crate::test; use alloc::format; - use untrusted; // Type-level representation of an arbitrary modulus. struct M {} @@ -1530,7 +1529,7 @@ mod tests { #[test] fn test_modulus_debug() { let (modulus, _) = Modulus::::from_be_bytes_with_bit_length(untrusted::Input::from( - &vec![0xff; LIMB_BYTES * MODULUS_MIN_LIMBS], + &[0xff; LIMB_BYTES * MODULUS_MIN_LIMBS], )) .unwrap(); assert_eq!("Modulus", format!("{:?}", modulus)); diff --git a/src/cpu.rs b/src/cpu.rs index 6ce9ef1b06..f81cb8b4bc 100644 --- a/src/cpu.rs +++ b/src/cpu.rs @@ -24,24 +24,23 @@ pub(crate) struct Features(()); #[inline(always)] pub(crate) fn features() -> Features { - // We don't do runtime feature detection on iOS. instead some features are - // assumed to be present; see `arm::Feature`. - #[cfg(all( - any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86", - target_arch = "x86_64", - target_env = "sgx" - ), - not(target_os = "ios") + // We don't do runtime feature detection on aarch64-apple-* as all AAarch64 + // features we use are available on every device since the first devices. 
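The bigint `Clone` fixes above (`m: self.m`, `encoding: self.encoding`) work because `PhantomData<T>` is `Copy` for every `T`, which is exactly what clippy's `clone_on_copy` lint points out. A minimal illustration with invented names:

```rust
use core::marker::PhantomData;

struct BoxedLimbsLike<M> {
    limbs: Box<[u64]>,
    m: PhantomData<M>,
}

impl<M> Clone for BoxedLimbsLike<M> {
    fn clone(&self) -> Self {
        Self {
            limbs: self.limbs.clone(),
            m: self.m, // `PhantomData` is `Copy`; `.clone()` would be redundant.
        }
    }
}

fn main() {
    struct Marker;
    let a = BoxedLimbsLike::<Marker> { limbs: vec![1, 2].into_boxed_slice(), m: PhantomData };
    let _b = a.clone();
}
```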
+ #[cfg(any( + target_arch = "x86", + target_arch = "x86_64", + target_env = "sgx", + all( + any(target_arch = "aarch64", target_arch = "arm"), + any(target_os = "android", target_os = "fuchsia", target_os = "linux") + ) ))] { static INIT: spin::Once<()> = spin::Once::new(); let () = INIT.call_once(|| { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - #[cfg(all(target_env = "sgx", feature = "sgx"))] + #[cfg(target_env = "sgx")] { extern crate std; use std::is_x86_feature_detected; @@ -52,97 +51,97 @@ pub(crate) fn features() -> Features { let [l1edx, l1ecx, l7ebx, l7ecx] = unsafe { &mut GFp_ia32cap_P }; if is_x86_feature_detected!("aes") { - *l1ecx |= 1<<25; + *l1ecx |= 1 << 25; } if is_x86_feature_detected!("pclmulqdq") { - *l1ecx |= 1<<1; + *l1ecx |= 1 << 1; } if is_x86_feature_detected!("rdrand") { - *l1ecx |= 1<<30; + *l1ecx |= 1 << 30; } if is_x86_feature_detected!("rdseed") { - *l7ebx |= 1<<18; + *l7ebx |= 1 << 18; } if is_x86_feature_detected!("tsc") { - *l1edx |= 1<<4; + *l1edx |= 1 << 4; } if is_x86_feature_detected!("mmx") { - *l1edx |= 1<<23; + *l1edx |= 1 << 23; } if is_x86_feature_detected!("sse") { - *l1edx |= 1<<25; + *l1edx |= 1 << 25; } if is_x86_feature_detected!("sse2") { - *l1edx |= 1<<26; + *l1edx |= 1 << 26; } if is_x86_feature_detected!("sse3") { - *l1ecx |= 1<<0; + *l1ecx |= 1 << 0; } if is_x86_feature_detected!("ssse3") { - *l1ecx |= 1<<9; + *l1ecx |= 1 << 9; } if is_x86_feature_detected!("sse4.1") { - *l1ecx |= 1<<19; + *l1ecx |= 1 << 19; } if is_x86_feature_detected!("sse4.2") { - *l1ecx |= 1<<20; + *l1ecx |= 1 << 20; } if is_x86_feature_detected!("sha") { - *l7ebx |= 1<<29; + *l7ebx |= 1 << 29; } if is_x86_feature_detected!("avx") { - *l1ecx |= 1<<28; + *l1ecx |= 1 << 28; } if is_x86_feature_detected!("avx2") { - *l7ebx |= 1<<5; + *l7ebx |= 1 << 5; } if is_x86_feature_detected!("avx512f") { - *l7ebx |= 1<<16; + *l7ebx |= 1 << 16; } if is_x86_feature_detected!("avx512cd") { - *l7ebx |= 1<<28; + *l7ebx |= 1 << 28; } if is_x86_feature_detected!("avx512er") { - *l7ebx |= 1<<27; + *l7ebx |= 1 << 27; } if is_x86_feature_detected!("avx512pf") { - *l7ebx |= 1<<26; + *l7ebx |= 1 << 26; } if is_x86_feature_detected!("avx512bw") { - *l7ebx |= 1<<30; + *l7ebx |= 1 << 30; } if is_x86_feature_detected!("avx512dq") { - *l7ebx |= 1<<17; + *l7ebx |= 1 << 17; } if is_x86_feature_detected!("avx512vl") { - *l7ebx |= 1<<31; + *l7ebx |= 1 << 31; } if is_x86_feature_detected!("avx512ifma") { - *l7ebx |= 1<<21; + *l7ebx |= 1 << 21; } if is_x86_feature_detected!("avx512vbmi") { - *l7ecx |= 1<<1; + *l7ecx |= 1 << 1; } if is_x86_feature_detected!("avx512vpopcntdq") { - *l7ecx |= 1<<14; + *l7ecx |= 1 << 14; } if is_x86_feature_detected!("fma") { - *l1ecx |= 1<<12; + *l1ecx |= 1 << 12; } if is_x86_feature_detected!("bmi1") { - *l7ebx |= 1<<3; + *l7ebx |= 1 << 3; } if is_x86_feature_detected!("bmi2") { - *l7ebx |= 1<<8; + *l7ebx |= 1 << 8; } if is_x86_feature_detected!("popcnt") { - *l1ecx |= 1<<23; + *l1ecx |= 1 << 23; } if is_x86_feature_detected!("fxsr") { - *l1edx |= 1<<24; + *l1edx |= 1 << 24; } if is_x86_feature_detected!("xsave") { - *l1ecx |= 1<<26; + *l1ecx |= 1 << 26; } /* will be stable on 1.33.0 if is_x86_feature_detected!("cmpxchg16b") { @@ -155,14 +154,14 @@ pub(crate) fn features() -> Features { // Rust can't detect the MOVBE feature yet, but it's widely // available. - *l1ecx |= 1<<22; + *l1ecx |= 1 << 22; // This bit is reserved in the CPUID specification, but the // BoringSSL detection code uses it to represent that this // is an Intel CPU. 
However, this bit is only used in // conjunction with the AVX bit to test for presence of // AVX, thus serving no purpose. Always set it. - *l1edx |= 1<<30; + *l1edx |= 1 << 30; // Features that don't map to leaf 1 or leaf 7: // Leaf 0xd: @@ -175,7 +174,6 @@ pub(crate) fn features() -> Features { // * lzcnt // * tbm } - #[cfg(not(target_env = "sgx"))] { @@ -189,16 +187,11 @@ pub(crate) fn features() -> Features { } #[cfg(all( - any(target_os = "android", target_os = "linux"), - any(target_arch = "aarch64", target_arch = "arm") + any(target_arch = "aarch64", target_arch = "arm"), + any(target_os = "android", target_os = "fuchsia", target_os = "linux") ))] { - arm::linux_setup(); - } - - #[cfg(all(target_os = "fuchsia", any(target_arch = "aarch64")))] - { - arm::fuchsia_setup(); + arm::setup(); } }); } @@ -211,7 +204,7 @@ pub(crate) mod arm { any(target_os = "android", target_os = "linux"), any(target_arch = "aarch64", target_arch = "arm") ))] - pub fn linux_setup() { + pub fn setup() { use libc::c_ulong; // XXX: The `libc` crate doesn't provide `libc::getauxval` consistently @@ -263,15 +256,15 @@ pub(crate) mod arm { features |= PMULL.mask; } if caps & HWCAP_SHA2 == HWCAP_SHA2 { - features |= 1 << 4; + features |= SHA256.mask; } unsafe { GFp_armcap_P = features }; } } - #[cfg(all(target_os = "fuchsia", any(target_arch = "aarch64")))] - pub fn fuchsia_setup() { + #[cfg(all(target_os = "fuchsia", target_arch = "aarch64"))] + pub fn setup() { type zx_status_t = i32; #[link(name = "zircon")] @@ -308,82 +301,139 @@ pub(crate) mod arm { } } - #[cfg(not(target_arch = "wasm32"))] + macro_rules! features { + { + $( + $name:ident { + mask: $mask:expr, + + /// Should we assume that the feature is always available + /// for aarch64-apple-* targets? The first AArch64 iOS + /// device used the Apple A7 chip. + // TODO: When we can use `if` in const expressions: + // ``` + // aarch64_apple: $aarch64_apple, + // ``` + aarch64_apple: true, + } + ),+ + , // trailing comma is required. + } => { + $( + #[allow(dead_code)] + pub(crate) const $name: Feature = Feature { + mask: $mask, + }; + )+ + + // TODO: When we can use `if` in const expressions, do this: + // ``` + // const ARMCAP_STATIC: u32 = 0 + // $( + // | ( if $aarch64_apple && + // cfg!(all(target_arch = "aarch64", + // target_vendor = "apple")) { + // $name.mask + // } else { + // 0 + // } + // ) + // )+; + // ``` + // + // TODO: Add static feature detection to other targets. + // TODO: Combine static feature detection with runtime feature + // detection. 
+ #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))] + const ARMCAP_STATIC: u32 = 0 + $( | $name.mask + )+; + #[cfg(not(all(target_arch = "aarch64", target_vendor = "apple")))] + const ARMCAP_STATIC: u32 = 0; + + #[cfg(all(target_arch = "aarch64", target_vendor = "apple"))] + #[test] + fn test_armcap_static_available() { + let features = crate::cpu::features(); + $( + assert!($name.available(features)); + )+ + } + } + } + + #[allow(dead_code)] pub(crate) struct Feature { - #[cfg_attr( - any( - target_os = "ios", - not(any(target_arch = "arm", target_arch = "aarch64")) - ), - allow(dead_code) - )] mask: u32, - - #[cfg_attr(not(target_os = "ios"), allow(dead_code))] - ios: bool, } - #[cfg(not(target_arch = "wasm32"))] impl Feature { + #[allow(dead_code)] #[inline(always)] pub fn available(&self, _: super::Features) -> bool { - #[cfg(all(target_os = "ios", any(target_arch = "arm", target_arch = "aarch64")))] - { - return self.ios; + if self.mask == self.mask & ARMCAP_STATIC { + return true; } #[cfg(all( - any(target_os = "android", target_os = "linux", target_os = "fuchsia"), + any(target_os = "android", target_os = "fuchsia", target_os = "linux"), any(target_arch = "arm", target_arch = "aarch64") ))] { - return self.mask == self.mask & unsafe { GFp_armcap_P }; + if self.mask == self.mask & unsafe { GFp_armcap_P } { + return true; + } } - #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))] - { - return false; - } + false } } - // Keep in sync with `ARMV7_NEON`. - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - pub(crate) const NEON: Feature = Feature { - mask: 1 << 0, - ios: true, - }; - - // Keep in sync with `ARMV8_AES`. - #[cfg(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86", - target_arch = "x86_64" - ))] - pub(crate) const AES: Feature = Feature { - mask: 1 << 2, - ios: true, - }; + features! { + // Keep in sync with `ARMV7_NEON`. + NEON { + mask: 1 << 0, + aarch64_apple: true, + }, + + // Keep in sync with `ARMV8_AES`. + AES { + mask: 1 << 2, + aarch64_apple: true, + }, + + // Keep in sync with `ARMV8_SHA256`. + SHA256 { + mask: 1 << 4, + aarch64_apple: true, + }, + + // Keep in sync with `ARMV8_PMULL`. + PMULL { + mask: 1 << 5, + aarch64_apple: true, + }, + } - // Keep in sync with `ARMV8_PMULL`. - #[cfg(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "x86", - target_arch = "x86_64" - ))] - pub(crate) const PMULL: Feature = Feature { - mask: 1 << 5, - ios: true, - }; + // Some non-Rust code still checks this even when it is statically known + // the given feature is available, so we have to ensure that this is + // initialized properly. Keep this in sync with the initialization in + // BoringSSL's crypto.c. + // + // TODO: This should have "hidden" visibility but we don't have a way of + // controlling that yet: https://github.com/rust-lang/rust/issues/73958. 
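`ARMCAP_STATIC` folds statically known target features into a compile-time mask so the runtime check can short-circuit. The TODO above about `if` in const expressions is resolvable on Rust 1.46+; a sketch of that future shape (masks and detection are illustrative placeholders):

```rust
const NEON_MASK: u32 = 1 << 0;

// Compile-time mask: NEON is always present on aarch64-apple-* targets.
// (`if` in const items is stable since Rust 1.46, above this crate's MSRV.)
const ARMCAP_STATIC: u32 =
    if cfg!(all(target_arch = "aarch64", target_vendor = "apple")) {
        NEON_MASK
    } else {
        0
    };

// Placeholder for the runtime detection that populates `GFp_armcap_P`.
fn armcap_dynamic() -> u32 {
    0
}

// A feature is available if its bits are covered statically or dynamically.
fn available(mask: u32) -> bool {
    mask == mask & ARMCAP_STATIC || mask == mask & armcap_dynamic()
}

fn main() {
    let _ = available(NEON_MASK);
}
```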
+ #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + #[no_mangle] + static mut GFp_armcap_P: u32 = ARMCAP_STATIC; #[cfg(all( - any(target_os = "android", target_os = "linux", target_os = "fuchsia"), - any(target_arch = "arm", target_arch = "aarch64") + any(target_arch = "arm", target_arch = "aarch64"), + target_vendor = "apple" ))] - extern "C" { - static mut GFp_armcap_P: u32; + #[test] + fn test_armcap_static_matches_armcap_dynamic() { + assert_eq!(ARMCAP_STATIC, 1 | 4 | 16 | 32); + assert_eq!(ARMCAP_STATIC, unsafe { GFp_armcap_P }); } } @@ -398,6 +448,7 @@ pub(crate) mod intel { } impl Feature { + #[allow(clippy::needless_return)] #[inline(always)] pub fn available(&self, _: super::Features) -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] diff --git a/src/ec/curve25519/ed25519/signing.rs b/src/ec/curve25519/ed25519/signing.rs index b89e0db66a..3b522e8191 100644 --- a/src/ec/curve25519/ed25519/signing.rs +++ b/src/ec/curve25519/ed25519/signing.rs @@ -22,7 +22,6 @@ use crate::{ signature::{self, KeyPair as SigningKeyPair}, }; use core::convert::TryInto; -use untrusted; /// An Ed25519 key pair, for signing. pub struct Ed25519KeyPair { @@ -43,10 +42,11 @@ impl Ed25519KeyPair { /// PKCS#8 document. /// /// The PKCS#8 document will be a v2 `OneAsymmetricKey` with the public key, - /// as described in [RFC 5958 Section 2]. See also - /// https://tools.ietf.org/html/draft-ietf-curdle-pkix-04. + /// as described in [RFC 5958 Section 2]; see [RFC 8410 Section 10.3] for an + /// example. /// /// [RFC 5958 Section 2]: https://tools.ietf.org/html/rfc5958#section-2 + /// [RFC 8410 Section 10.3]: https://tools.ietf.org/html/rfc8410#section-10.3 pub fn generate_pkcs8( rng: &dyn rand::SecureRandom, ) -> Result { diff --git a/src/ec/curve25519/ed25519/verification.rs b/src/ec/curve25519/ed25519/verification.rs index 6d082e093f..e0c1b652fa 100644 --- a/src/ec/curve25519/ed25519/verification.rs +++ b/src/ec/curve25519/ed25519/verification.rs @@ -17,7 +17,6 @@ use super::{super::ops::*, eddsa_digest}; use crate::{error, sealed, signature}; use core::convert::TryInto; -use untrusted; /// Parameters for EdDSA signing and verification. pub struct EdDSAParameters; diff --git a/src/ec/curve25519/x25519.rs b/src/ec/curve25519/x25519.rs index 44ee17f4a9..53a2a5cf84 100644 --- a/src/ec/curve25519/x25519.rs +++ b/src/ec/curve25519/x25519.rs @@ -17,7 +17,6 @@ use super::{ops, scalar::SCALAR_LEN}; use crate::{agreement, constant_time, cpu, ec, error, rand}; use core::convert::TryInto; -use untrusted; static CURVE25519: ec::Curve = ec::Curve { public_key_len: PUBLIC_KEY_LEN, diff --git a/src/ec/suite_b.rs b/src/ec/suite_b.rs index caa5b3f1df..9e363563b8 100644 --- a/src/ec/suite_b.rs +++ b/src/ec/suite_b.rs @@ -16,7 +16,6 @@ use self::ops::*; use crate::{arithmetic::montgomery::*, cpu, ec, error, io::der, limb::LimbMask, pkcs8}; -use untrusted; // NIST SP 800-56A Step 3: "If q is an odd prime p, verify that // yQ**2 = xQ**3 + axQ + b in GF(p), where the arithmetic is performed modulo diff --git a/src/ec/suite_b/curve.rs b/src/ec/suite_b/curve.rs index 0788e10712..e0ff4f4617 100644 --- a/src/ec/suite_b/curve.rs +++ b/src/ec/suite_b/curve.rs @@ -31,7 +31,7 @@ macro_rules! 
suite_b_curve { /// [NIST Special Publication 800-56A, revision 2]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf /// [Suite B Implementer's Guide to NIST SP 800-56A]: - /// https://github.com/briansmith/ring/blob/master/doc/ecdh.pdf + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf pub static $NAME: ec::Curve = ec::Curve { public_key_len: 1 + (2 * (($bits + 7) / 8)), elem_scalar_seed_len: ($bits + 7) / 8, diff --git a/src/ec/suite_b/ecdh.rs b/src/ec/suite_b/ecdh.rs index f8680ccad6..aae31d2369 100644 --- a/src/ec/suite_b/ecdh.rs +++ b/src/ec/suite_b/ecdh.rs @@ -16,7 +16,6 @@ use super::{ops::*, private_key::*, public_key::*}; use crate::{agreement, ec, error}; -use untrusted; /// A key agreement algorithm. macro_rules! ecdh { @@ -38,7 +37,7 @@ macro_rules! ecdh { /// [NIST Special Publication 800-56A, revision 2]: /// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf /// [Suite B Implementer's Guide to NIST SP 800-56A]: - /// https://github.com/briansmith/ring/blob/master/doc/ecdh.pdf + /// https://github.com/briansmith/ring/blob/main/doc/ecdh.pdf pub static $NAME: agreement::Algorithm = agreement::Algorithm { curve: $curve, ecdh: $ecdh, diff --git a/src/ec/suite_b/ecdsa/digest_scalar.rs b/src/ec/suite_b/ecdsa/digest_scalar.rs index 1f885c1a92..133e7da5df 100644 --- a/src/ec/suite_b/ecdsa/digest_scalar.rs +++ b/src/ec/suite_b/ecdsa/digest_scalar.rs @@ -19,7 +19,6 @@ use crate::{ ec::suite_b::ops::*, limb::{self, LIMB_BYTES}, }; -use untrusted; /// Calculate the digest of `msg` using the digest algorithm `digest_alg`. Then /// convert the digest to a scalar in the range [0, n) as described in @@ -84,7 +83,6 @@ mod tests { limb::{self, LIMB_BYTES}, test, }; - use untrusted; #[test] fn test() { diff --git a/src/ec/suite_b/ecdsa/signing.rs b/src/ec/suite_b/ecdsa/signing.rs index ca2bb39b05..5422e06b16 100644 --- a/src/ec/suite_b/ecdsa/signing.rs +++ b/src/ec/suite_b/ecdsa/signing.rs @@ -26,8 +26,6 @@ use crate::{ io::der, limb, pkcs8, rand, sealed, signature, }; -use untrusted; - /// An ECDSA signing algorithm. pub struct EcdsaSigningAlgorithm { curve: &'static ec::Curve, diff --git a/src/ec/suite_b/ecdsa/verification.rs b/src/ec/suite_b/ecdsa/verification.rs index 2b4ccbed69..be551e695d 100644 --- a/src/ec/suite_b/ecdsa/verification.rs +++ b/src/ec/suite_b/ecdsa/verification.rs @@ -23,7 +23,6 @@ use crate::{ io::der, limb, sealed, signature, }; -use untrusted; /// An ECDSA verification algorithm. 
pub struct EcdsaVerificationAlgorithm { diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs index 6bcb8a4bb5..13d80c0d2e 100644 --- a/src/ec/suite_b/ops.rs +++ b/src/ec/suite_b/ops.rs @@ -14,7 +14,6 @@ use crate::{arithmetic::montgomery::*, c, error, limb::*}; use core::marker::PhantomData; -use untrusted; pub use self::elem::*; @@ -441,7 +440,6 @@ mod tests { use super::*; use crate::test; use alloc::{format, vec, vec::Vec}; - use untrusted; const ZERO_SCALAR: Scalar = Scalar { limbs: [0; MAX_LIMBS], @@ -491,11 +489,11 @@ mod tests { let b = consume_elem(cops, test_case, "b"); let expected_sum = consume_elem(cops, test_case, "r"); - let mut actual_sum = a.clone(); + let mut actual_sum = a; ops.public_key_ops.common.elem_add(&mut actual_sum, &b); assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); - let mut actual_sum = b.clone(); + let mut actual_sum = b; ops.public_key_ops.common.elem_add(&mut actual_sum, &a); assert_limbs_are_equal(cops, &actual_sum.limbs, &expected_sum.limbs); @@ -692,10 +690,10 @@ mod tests { test::run(test_file, |section, test_case| { assert_eq!(section, ""); let cops = ops.common; - let mut a = consume_scalar(cops, test_case, "a"); + let a = consume_scalar(cops, test_case, "a"); let b = consume_scalar_mont(cops, test_case, "b"); let expected_result = consume_scalar(cops, test_case, "r"); - let actual_result = ops.scalar_product(&mut a, &b); + let actual_result = ops.scalar_product(&a, &b); assert_limbs_are_equal(cops, &actual_result.limbs, &expected_result.limbs); Ok(()) @@ -1072,7 +1070,7 @@ mod tests { p } - fn consume_point_elem(ops: &CommonOps, limbs_out: &mut [Limb], elems: &Vec<&str>, i: usize) { + fn consume_point_elem(ops: &CommonOps, limbs_out: &mut [Limb], elems: &[&str], i: usize) { let bytes = test::from_hex(elems[i]).unwrap(); let bytes = untrusted::Input::from(&bytes); let r: Elem = elem_parse_big_endian_fixed_consttime(ops, bytes).unwrap(); @@ -1087,7 +1085,7 @@ mod tests { } fn consume_point(ops: &PrivateKeyOps, test_case: &mut test::TestCase, name: &str) -> TestPoint { - fn consume_point_elem(ops: &CommonOps, elems: &Vec<&str>, i: usize) -> Elem { + fn consume_point_elem(ops: &CommonOps, elems: &[&str], i: usize) -> Elem { let bytes = test::from_hex(elems[i]).unwrap(); let bytes = untrusted::Input::from(&bytes); let unencoded: Elem = @@ -1178,84 +1176,6 @@ mod tests { } } -#[cfg(feature = "internal_benches")] -mod internal_benches { - use super::{Limb, MAX_LIMBS}; - - pub const LIMBS_1: [Limb; MAX_LIMBS] = limbs![1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - - pub const LIMBS_ALTERNATING_10: [Limb; MAX_LIMBS] = limbs![ - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010, - 0b10101010_10101010_10101010_10101010 - ]; -} - -#[cfg(feature = "internal_benches")] -macro_rules! bench_curve { - ( $vectors:expr ) => { - use super::super::{Elem, Scalar}; - extern crate test; - - #[bench] - fn elem_inverse_squared_bench(bench: &mut test::Bencher) { - // This benchmark assumes that `elem_inverse_squared()` is - // constant-time so inverting 1 mod q is as good of a choice as - // anything. 
- let mut a = Elem::zero(); - a.limbs[0] = 1; - bench.iter(|| { - let _ = PRIVATE_KEY_OPS.elem_inverse_squared(&a); - }); - } - - #[bench] - fn elem_product_bench(bench: &mut test::Bencher) { - // This benchmark assumes that the multiplication is constant-time - // so 0 * 0 is as good of a choice as anything. - let a: Elem = Elem::zero(); - let b: Elem = Elem::zero(); - bench.iter(|| { - let _ = COMMON_OPS.elem_product(&a, &b); - }); - } - - #[bench] - fn elem_squared_bench(bench: &mut test::Bencher) { - // This benchmark assumes that the squaring is constant-time so - // 0**2 * 0 is as good of a choice as anything. - let a = Elem::zero(); - bench.iter(|| { - let _ = COMMON_OPS.elem_squared(&a); - }); - } - - #[bench] - fn scalar_inv_to_mont_bench(bench: &mut test::Bencher) { - const VECTORS: &[Scalar] = $vectors; - let vectors_len = VECTORS.len(); - let mut i = 0; - bench.iter(|| { - let _ = SCALAR_OPS.scalar_inv_to_mont(&VECTORS[i]); - - i += 1; - if i == vectors_len { - i = 0; - } - }); - } - }; -} - mod elem; pub mod p256; pub mod p384; diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs index 69c89eb893..4c54f5c00e 100644 --- a/src/ec/suite_b/ops/p256.rs +++ b/src/ec/suite_b/ops/p256.rs @@ -380,36 +380,3 @@ extern "C" { rep: Limb, ); } - -#[cfg(feature = "internal_benches")] -mod internal_benches { - use super::{super::internal_benches::*, *}; - - bench_curve!(&[ - Scalar { - limbs: LIMBS_1, - m: PhantomData, - encoding: PhantomData, - }, - Scalar { - limbs: LIMBS_ALTERNATING_10, - m: PhantomData, - encoding: PhantomData, - }, - Scalar { - // n - 1 - limbs: p256_limbs![ - 0xfc632551 - 1, - 0xf3b9cac2, - 0xa7179e84, - 0xbce6faad, - 0xffffffff, - 0xffffffff, - 0x00000000, - 0xffffffff - ], - m: PhantomData, - encoding: PhantomData, - }, - ]); -} diff --git a/src/ec/suite_b/ops/p384.rs b/src/ec/suite_b/ops/p384.rs index 4b2ecb8300..7ecba1f18f 100644 --- a/src/ec/suite_b/ops/p384.rs +++ b/src/ec/suite_b/ops/p384.rs @@ -368,40 +368,3 @@ extern "C" { b: *const Limb, // [COMMON_OPS.num_limbs] ); } - -#[cfg(feature = "internal_benches")] -mod internal_benches { - use super::{super::internal_benches::*, *}; - - bench_curve!(&[ - Scalar { - limbs: LIMBS_1, - encoding: PhantomData, - m: PhantomData - }, - Scalar { - limbs: LIMBS_ALTERNATING_10, - encoding: PhantomData, - m: PhantomData - }, - Scalar { - // n - 1 - limbs: p384_limbs![ - 0xccc52973 - 1, - 0xecec196a, - 0x48b0a77a, - 0x581a0db2, - 0xf4372ddf, - 0xc7634d81, - 0xffffffff, - 0xffffffff, - 0xffffffff, - 0xffffffff, - 0xffffffff, - 0xffffffff - ], - encoding: PhantomData, - m: PhantomData, - }, - ]); -} diff --git a/src/ec/suite_b/private_key.rs b/src/ec/suite_b/private_key.rs index fb16b245e9..31c35664f6 100644 --- a/src/ec/suite_b/private_key.rs +++ b/src/ec/suite_b/private_key.rs @@ -22,7 +22,6 @@ use crate::{ limb::{self, LIMB_BYTES}, rand, }; -use untrusted; /// Generates a random scalar in the range [1, n). pub fn random_scalar( diff --git a/src/ec/suite_b/public_key.rs b/src/ec/suite_b/public_key.rs index 4521af339e..5bafa36039 100644 --- a/src/ec/suite_b/public_key.rs +++ b/src/ec/suite_b/public_key.rs @@ -17,7 +17,6 @@ use super::{ops::*, verify_affine_point_is_on_the_curve}; use crate::{arithmetic::montgomery::*, error}; -use untrusted; /// Parses a public key encoded in uncompressed form. 
The key is validated /// using the ECC Partial Public-Key Validation Routine from @@ -69,7 +68,6 @@ pub fn parse_uncompressed_point( mod tests { use super::{super::ops, *}; use crate::test; - use untrusted; #[test] fn parse_uncompressed_point_test() { diff --git a/src/endian.rs b/src/endian.rs index 77ecd91e2d..962397c561 100644 --- a/src/endian.rs +++ b/src/endian.rs @@ -2,7 +2,12 @@ use core::{convert::TryInto, num::Wrapping}; /// An `Encoding` of a type `T` can be converted to/from its byte /// representation without any byte swapping or other computation. -pub trait Encoding: From + Into { +/// +/// The `Self: Copy` constraint addresses `clippy::declare_interior_mutable_const`. +pub trait Encoding: From + Into +where + Self: Copy, +{ const ZERO: Self; } @@ -19,6 +24,12 @@ pub trait ArrayEncoding { fn as_byte_array(&self) -> &T; } +/// Work around the inability to implement `from` for arrays of `Encoding`s +/// due to the coherence rules. +pub trait FromByteArray { + fn from_byte_array(a: &T) -> Self; +} + macro_rules! define_endian { ($endian:ident) => { #[repr(transparent)] @@ -44,15 +55,29 @@ macro_rules! define_endian { }; } -macro_rules! impl_as_ref { +macro_rules! impl_from_byte_array { + ($endian:ident, $base:ident, $elems:expr) => { + impl FromByteArray<[u8; $elems * core::mem::size_of::<$base>()]> + for [$endian<$base>; $elems] + { + fn from_byte_array(a: &[u8; $elems * core::mem::size_of::<$base>()]) -> Self { + unsafe { core::mem::transmute_copy(a) } + } + } + }; +} + +macro_rules! impl_array_encoding { ($endian:ident, $base:ident, $elems:expr) => { impl ArrayEncoding<[u8; $elems * core::mem::size_of::<$base>()]> for [$endian<$base>; $elems] { - fn as_byte_array<'a>(&'a self) -> &'a [u8; $elems * core::mem::size_of::<$base>()] { + fn as_byte_array(&self) -> &[u8; $elems * core::mem::size_of::<$base>()] { as_byte_slice(self).try_into().unwrap() } } + + impl_from_byte_array!($endian, $base, $elems); }; } @@ -97,10 +122,11 @@ macro_rules! impl_endian { } } - impl_as_ref!($endian, $base, 1); - impl_as_ref!($endian, $base, 2); - impl_as_ref!($endian, $base, 3); - impl_as_ref!($endian, $base, 4); + impl_array_encoding!($endian, $base, 1); + impl_array_encoding!($endian, $base, 2); + impl_array_encoding!($endian, $base, 3); + impl_array_encoding!($endian, $base, 4); + impl_from_byte_array!($endian, $base, 8); }; } diff --git a/src/error.rs b/src/error.rs index 65d3df0489..23e2ab32a9 100644 --- a/src/error.rs +++ b/src/error.rs @@ -14,8 +14,6 @@ //! Error reporting. -use untrusted; - #[cfg(feature = "std")] extern crate std; @@ -78,15 +76,30 @@ extern crate std; #[derive(Clone, Copy, Debug, PartialEq)] pub struct Unspecified; +impl Unspecified { + fn description_() -> &'static str { + "ring::error::Unspecified" + } +} + // This is required for the implementation of `std::error::Error`. impl core::fmt::Display for Unspecified { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - f.write_str("ring::error::Unspecified") + f.write_str(Self::description_()) } } #[cfg(feature = "std")] -impl std::error::Error for Unspecified {} +impl std::error::Error for Unspecified { + #[inline] + fn cause(&self) -> Option<&dyn std::error::Error> { + None + } + + fn description(&self) -> &str { + Self::description_() + } +} impl From for Unspecified { fn from(_: untrusted::EndOfInput) -> Self { @@ -102,10 +115,10 @@ impl From for Unspecified { /// An error parsing or validating a key. 
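The new `FromByteArray` trait above exists because coherence rules prevent a blanket `From<[u8; N]>` for arrays of `Encoding` wrappers; the macro implements it with `transmute_copy`. A safe sketch of the same conversion, copying 4-byte chunks into a `#[repr(transparent)]` wrapper (layout-equivalent to the transmute):

```rust
use std::convert::TryInto;

#[derive(Clone, Copy, Debug, PartialEq)]
#[repr(transparent)]
struct LittleEndian([u8; 4]); // stores the little-endian byte encoding

fn from_byte_array(a: &[u8; 8]) -> [LittleEndian; 2] {
    let mut out = [LittleEndian([0; 4]); 2];
    for (word, chunk) in out.iter_mut().zip(a.chunks_exact(4)) {
        // No byte swapping or other computation, matching the `Encoding`
        // contract; the wrapper just records the encoding in its type.
        *word = LittleEndian(chunk.try_into().unwrap());
    }
    out
}

fn main() {
    let words = from_byte_array(&[1, 0, 0, 0, 2, 0, 0, 0]);
    assert_eq!(words, [LittleEndian([1, 0, 0, 0]), LittleEndian([2, 0, 0, 0])]);
}
```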
/// -/// The `Display` implementation will return a string that will help you better -/// understand why a key was rejected change which errors are reported in which -/// situations while minimizing the likelihood that any applications will be -/// broken. +/// The `Display` implementation and `::description()` +/// will return a string that will help you better understand why a key was +/// rejected change which errors are reported in which situations while +/// minimizing the likelihood that any applications will be broken. /// /// Here is an incomplete list of reasons a key may be unsupported: /// @@ -134,6 +147,11 @@ impl From for Unspecified { pub struct KeyRejected(&'static str); impl KeyRejected { + /// The value returned from ::description() + pub fn description_(&self) -> &'static str { + self.0 + } + pub(crate) fn inconsistent_components() -> Self { KeyRejected("InconsistentComponents") } @@ -185,11 +203,19 @@ impl KeyRejected { } #[cfg(feature = "std")] -impl std::error::Error for KeyRejected {} +impl std::error::Error for KeyRejected { + fn cause(&self) -> Option<&dyn std::error::Error> { + None + } + + fn description(&self) -> &str { + self.description_() + } +} impl core::fmt::Display for KeyRejected { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - f.write_str(self.0) + f.write_str(self.description_()) } } diff --git a/src/hmac.rs b/src/hmac.rs index 1329058f66..3e2d7e7b0f 100644 --- a/src/hmac.rs +++ b/src/hmac.rs @@ -105,9 +105,9 @@ //! //! [RFC 2104]: https://tools.ietf.org/html/rfc2104 //! [code for `ring::pbkdf2`]: -//! https://github.com/briansmith/ring/blob/master/src/pbkdf2.rs +//! https://github.com/briansmith/ring/blob/main/src/pbkdf2.rs //! [code for `ring::hkdf`]: -//! https://github.com/briansmith/ring/blob/master/src/hkdf.rs +//! https://github.com/briansmith/ring/blob/main/src/hkdf.rs use crate::{constant_time, digest, error, hkdf, rand}; @@ -182,7 +182,9 @@ impl Key { /// random value generated from `rng`. /// /// The key will be `digest_alg.output_len` bytes long, based on the - /// recommendation in https://tools.ietf.org/html/rfc2104#section-3. + /// recommendation in [RFC 2104 Section 3]. + /// + /// [RFC 2104 Section 3]: https://tools.ietf.org/html/rfc2104#section-3 pub fn generate( algorithm: Algorithm, rng: &dyn rand::SecureRandom, @@ -363,7 +365,7 @@ mod tests { // completely wacky. #[test] pub fn hmac_signing_key_coverage() { - let mut rng = rand::SystemRandom::new(); + let rng = rand::SystemRandom::new(); const HELLO_WORLD_GOOD: &[u8] = b"hello, world"; const HELLO_WORLD_BAD: &[u8] = b"hello, worle"; @@ -374,7 +376,7 @@ mod tests { hmac::HMAC_SHA384, hmac::HMAC_SHA512, ] { - let key = hmac::Key::generate(*algorithm, &mut rng).unwrap(); + let key = hmac::Key::generate(*algorithm, &rng).unwrap(); let tag = hmac::sign(&key, HELLO_WORLD_GOOD); assert!(hmac::verify(&key, HELLO_WORLD_GOOD, tag.as_ref()).is_ok()); assert!(hmac::verify(&key, HELLO_WORLD_BAD, tag.as_ref()).is_err()) diff --git a/src/io/der.rs b/src/io/der.rs index 325d6f0d8b..1a00d85999 100644 --- a/src/io/der.rs +++ b/src/io/der.rs @@ -18,7 +18,6 @@ use super::Positive; use crate::error; -use untrusted; pub const CONSTRUCTED: u8 = 1 << 5; pub const CONTEXT_SPECIFIC: u8 = 2 << 6; @@ -212,7 +211,6 @@ pub fn positive_integer<'a>( mod tests { use super::*; use crate::error; - use untrusted; fn with_good_i(value: &[u8], f: F) where diff --git a/src/lib.rs b/src/lib.rs index 486c0203f6..0e2be35ff3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,8 +15,6 @@ //! 
Safe, fast, small crypto using Rust with BoringSSL's cryptography //! primitives. //! -//! git clone https://github.com/briansmith/ring -//! //! # Feature Flags //! //! @@ -33,22 +31,36 @@ //! fallbacks will not occur. See the documentation for //! rand::SystemRandom for more details. //!
std -//! Enable features that use libstd, in particular `std::error::Error` -//! integration. +//! Enable features that use libstd, in particular +//! <code>std::error::Error</code> integration. //!
wasm32_c //! Enables features that require a C compiler on wasm32 targets, such as //! the <code>constant_time</code> module, HMAC verification, and PBKDF2 //! verification. Without this feature, only a subset of functionality //! is provided to wasm32 targets so that a C compiler isn't needed. A //! typical invocation would be: -//! <code>TARGET_AR=llvm-ar cargo test --target=wasm32-unknown-unknown --features=wasm32_c</code> -//! with <code>llvm-ar</code> and <code>clang</code> in <code>$PATH</code>. +//! <code>TARGET_CC=clang-10 TARGET_AR=llvm-ar-10 cargo test --target=wasm32-unknown-unknown --features=wasm32_c</code> +//! with <code>llvm-ar-10</code> and <code>clang-10</code> in <code>$PATH</code>. //! (Going forward more functionality should be enabled by default, without //! requiring these hacks, and without requiring a C compiler.) //!
#![doc(html_root_url = "https://briansmith.org/rustdoc/")] #![allow( + clippy::collapsible_if, + clippy::identity_op, + clippy::len_without_is_empty, + clippy::len_zero, + clippy::let_unit_value, + clippy::many_single_char_names, + clippy::needless_range_loop, + clippy::new_without_default, + clippy::neg_cmp_op_on_partial_ord, + clippy::range_plus_one, + clippy::too_many_arguments, + clippy::trivially_copy_pass_by_ref, + clippy::type_complexity, + clippy::unreadable_literal, missing_copy_implementations, missing_debug_implementations, non_camel_case_types, @@ -57,24 +69,9 @@ )] // `#[derive(...)]` uses `trivial_numeric_casts` and `unused_qualifications` // internally. -#![deny( - missing_docs, - // unstable_features, // Used by `internal_benches` - unused_qualifications, - variant_size_differences, -)] -#![forbid( - anonymous_parameters, - trivial_casts, - trivial_numeric_casts, - unused_extern_crates, - unused_import_braces, - unused_results, - // warnings -)] +#![deny(missing_docs, unused_qualifications, variant_size_differences)] +#![forbid(unused_results)] #![no_std] -#![cfg_attr(feature = "internal_benches", allow(unstable_features), feature(test))] - #![cfg_attr(all(target_env = "sgx", feature = "sgx"), feature(stdsimd))] #[cfg(feature = "alloc")] diff --git a/src/limb.rs b/src/limb.rs index 65fdae5ca7..64fb536c8c 100644 --- a/src/limb.rs +++ b/src/limb.rs @@ -19,7 +19,6 @@ //! limbs use the native endianness. use crate::{c, error}; -use untrusted; #[cfg(feature = "alloc")] use crate::bits; @@ -350,7 +349,6 @@ extern "C" { #[cfg(test)] mod tests { use super::*; - use untrusted; const MAX: Limb = LimbMask::True as Limb; diff --git a/src/pbkdf2.rs b/src/pbkdf2.rs index 4f957f453d..c2bd33e3f0 100644 --- a/src/pbkdf2.rs +++ b/src/pbkdf2.rs @@ -122,7 +122,7 @@ pub struct Algorithm(hmac::Algorithm); /// PBKDF2 using HMAC-SHA1. pub static PBKDF2_HMAC_SHA1: Algorithm = Algorithm(hmac::HMAC_SHA1_FOR_LEGACY_USE_ONLY); -/// PBKDF2 using HMAC-h. +/// PBKDF2 using HMAC-SHA256. pub static PBKDF2_HMAC_SHA256: Algorithm = Algorithm(hmac::HMAC_SHA256); /// PBKDF2 using HMAC-SHA384. diff --git a/src/pkcs8.rs b/src/pkcs8.rs index 53bec5ae81..c3ca49918a 100644 --- a/src/pkcs8.rs +++ b/src/pkcs8.rs @@ -17,7 +17,6 @@ //! [RFC 5958]: https://tools.ietf.org/html/rfc5958. 
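Besides the lint reshuffle, the hunk above fixes the `PBKDF2_HMAC_SHA256` doc comment (it previously read "HMAC-h"). For context, a usage sketch of that algorithm through the public `pbkdf2` module (salt, password, and iteration count are illustrative only):

```rust
use ring::pbkdf2;
use std::num::NonZeroU32;

fn main() {
    let iterations = NonZeroU32::new(100_000).unwrap();
    let salt = b"illustrative salt";
    let password = b"illustrative password";

    // Derive a key the same length as the underlying digest output.
    let mut derived = [0u8; ring::digest::SHA256_OUTPUT_LEN];
    pbkdf2::derive(pbkdf2::PBKDF2_HMAC_SHA256, iterations, salt, password, &mut derived);

    // Verification re-derives and compares in constant time.
    assert!(
        pbkdf2::verify(pbkdf2::PBKDF2_HMAC_SHA256, iterations, salt, password, &derived).is_ok()
    );
}
```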
use crate::{ec, error, io::der}; -use untrusted; pub(crate) enum Version { V1Only, diff --git a/src/rand.rs b/src/rand.rs index d69a7a6c40..1af1571128 100644 --- a/src/rand.rs +++ b/src/rand.rs @@ -180,10 +180,12 @@ use self::sysrand::fill as fill_impl; use self::sysrand_or_urandom::fill as fill_impl; #[cfg(any( + target_os = "dragonfly", target_os = "freebsd", + target_os = "illumos", target_os = "netbsd", target_os = "openbsd", - target_os = "solaris" + target_os = "solaris", ))] use self::urandom::fill as fill_impl; @@ -354,10 +356,12 @@ mod sysrand_or_urandom { any(target_os = "android", target_os = "linux"), feature = "dev_urandom_fallback" ), + target_os = "dragonfly", target_os = "freebsd", target_os = "netbsd", target_os = "openbsd", - target_os = "solaris" + target_os = "solaris", + target_os = "illumos" ))] mod urandom { use crate::error; @@ -444,7 +448,7 @@ mod rdrandom { let mut buf = [0u8; 8]; match Result::from(unsafe { CRYPTO_rdrand(&mut buf) }) { Ok(()) => return Ok(buf), - Err(_) => continue + Err(_) => continue, } } Err(error::Unspecified) diff --git a/src/rsa.rs b/src/rsa.rs index 04a3bf87bd..9adb3285dd 100644 --- a/src/rsa.rs +++ b/src/rsa.rs @@ -24,7 +24,6 @@ use crate::{ io::{self, der}, limb, }; -use untrusted; mod padding; diff --git a/src/rsa/padding.rs b/src/rsa/padding.rs index 3ba00e10f0..f6b4cf6c74 100644 --- a/src/rsa/padding.rs +++ b/src/rsa/padding.rs @@ -14,7 +14,6 @@ use super::PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN; use crate::{bits, digest, error, io::der}; -use untrusted; #[cfg(feature = "alloc")] use crate::rand; @@ -100,7 +99,7 @@ impl Verification for PKCS1 { let mut calculated = [0u8; PUBLIC_KEY_PUBLIC_MODULUS_MAX_LEN]; let calculated = &mut calculated[..mod_bits.as_usize_bytes_rounded_up()]; pkcs1_encode(&self, m_hash, calculated); - if m.read_bytes_to_end() != *calculated.as_ref() { + if m.read_bytes_to_end() != *calculated { return Err(error::Unspecified); } Ok(()) @@ -279,7 +278,7 @@ impl RsaEncoding for PSS { { // Steps 7. - let masked_db = masked_db.into_iter(); + let masked_db = masked_db.iter_mut(); // `PS` is all zero bytes, so skipping `ps_len` bytes is equivalent // to XORing `PS` onto `db`. let mut masked_db = masked_db.skip(metrics.ps_len); @@ -522,7 +521,6 @@ mod test { use super::*; use crate::{digest, error, test}; use alloc::vec; - use untrusted; #[test] fn test_pss_padding_verify() { diff --git a/src/rsa/signing.rs b/src/rsa/signing.rs index 9ae904e4b2..52d857d302 100644 --- a/src/rsa/signing.rs +++ b/src/rsa/signing.rs @@ -25,7 +25,6 @@ use crate::{ pkcs8, rand, signature, }; use alloc::boxed::Box; -use untrusted; /// An RSA key pair, used for signing. pub struct RsaKeyPair { @@ -621,8 +620,7 @@ mod tests { const MESSAGE: &[u8] = b"hello, world"; let rng = rand::SystemRandom::new(); - const PRIVATE_KEY_DER: &'static [u8] = - include_bytes!("signature_rsa_example_private_key.der"); + const PRIVATE_KEY_DER: &[u8] = include_bytes!("signature_rsa_example_private_key.der"); let key_pair = signature::RsaKeyPair::from_der(PRIVATE_KEY_DER).unwrap(); // The output buffer is one byte too short. diff --git a/src/rsa/verification.rs b/src/rsa/verification.rs index cedf64f783..f898f211a6 100644 --- a/src/rsa/verification.rs +++ b/src/rsa/verification.rs @@ -22,8 +22,6 @@ use crate::{ sealed, signature, }; -use untrusted; - #[derive(Debug)] pub struct Key { pub n: bigint::Modulus, diff --git a/src/signature.rs b/src/signature.rs index e325dc0a7a..bef92dc4b8 100644 --- a/src/signature.rs +++ b/src/signature.rs @@ -111,7 +111,7 @@ //! 
[NIST Special Publication 800-56A, revision 2]: //! http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Ar2.pdf //! [Suite B implementer's guide to FIPS 186-3]: -//! https://github.com/briansmith/ring/blob/master/doc/ecdsa.pdf +//! https://github.com/briansmith/ring/blob/main/doc/ecdsa.pdf //! [RFC 3279 Section 2.2.3]: //! https://tools.ietf.org/html/rfc3279#section-2.2.3 //! [RFC 3447 Section 8.2]: @@ -132,7 +132,7 @@ //! signature::{self, KeyPair}, //! }; //! -//! # fn sign_and_verify_ed25519() -> Result<(), ring::error::Unspecified> { +//! # fn main() -> Result<(), ring::error::Unspecified> { //! // Generate a key pair in PKCS#8 (v2) format. //! let rng = rand::SystemRandom::new(); //! let pkcs8_bytes = signature::Ed25519KeyPair::generate_pkcs8(&rng)?; @@ -160,8 +160,6 @@ //! //! # Ok(()) //! # } -//! -//! # fn main() { sign_and_verify_ed25519().unwrap() } //! ``` //! //! ## Signing and verifying with RSA (PKCS#1 1.5 padding) @@ -257,7 +255,6 @@ //! ``` use crate::{cpu, ec, error, sealed}; -use untrusted; pub use crate::ec::{ curve25519::ed25519::{ diff --git a/tests/aead_tests.rs b/tests/aead_tests.rs index 0fc7ff18e8..75e0e9e92d 100644 --- a/tests/aead_tests.rs +++ b/tests/aead_tests.rs @@ -13,23 +13,6 @@ // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #![cfg(any(not(target_arch = "wasm32"), feature = "wasm32_c"))] -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] #[cfg(target_arch = "wasm32")] use wasm_bindgen_test::{wasm_bindgen_test, wasm_bindgen_test_configure}; @@ -208,22 +191,20 @@ fn test_aead( ]; let mut more_comprehensive_in_prefix_lengths = [0; 4096]; - let in_prefix_lengths; - if cfg!(debug_assertions) { - in_prefix_lengths = &MINIMAL_IN_PREFIX_LENS[..]; + let in_prefix_lengths = if cfg!(debug_assertions) { + &MINIMAL_IN_PREFIX_LENS[..] } else { + #[allow(clippy::needless_range_loop)] for b in 0..more_comprehensive_in_prefix_lengths.len() { more_comprehensive_in_prefix_lengths[b] = b; } - in_prefix_lengths = &more_comprehensive_in_prefix_lengths[..]; - } + &more_comprehensive_in_prefix_lengths[..] + }; let mut o_in_out = vec![123u8; 4096]; - for in_prefix_len in in_prefix_lengths.iter() { + for &in_prefix_len in in_prefix_lengths.iter() { o_in_out.truncate(0); - for _ in 0..*in_prefix_len { - o_in_out.push(123); - } + o_in_out.resize(in_prefix_len, 123); o_in_out.extend_from_slice(&ct[..]); let nonce = aead::Nonce::try_assume_unique_for_key(&nonce_bytes).unwrap(); @@ -233,7 +214,7 @@ fn test_aead( nonce, aead::Aad::from(&aad[..]), &mut o_in_out, - *in_prefix_len.., + in_prefix_len.., ); match error { None => { @@ -300,6 +281,7 @@ fn open_with_less_safe_key<'a>( key.open_within(nonce, aad, in_out, ciphertext_and_tag) } +#[allow(clippy::range_plus_one)] fn test_aead_key_sizes(aead_alg: &'static aead::Algorithm) { let key_len = aead_alg.key_len(); let key_data = vec![0u8; key_len * 2]; @@ -327,6 +309,7 @@ fn test_aead_key_sizes(aead_alg: &'static aead::Algorithm) { } // Test that we reject non-standard nonce sizes. 
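The rewritten prefix-length loop above (see the `o_in_out.resize(..)` change) exercises `open_within` with a nonzero start index: the ciphertext sits after `in_prefix_len` junk bytes and the plaintext is shifted to the front of the buffer. A sketch of that pattern with the public API (key and nonce values are illustrative; the single nonce value is used once to seal and once to open the same message):

```rust
use ring::aead;

fn main() -> Result<(), ring::error::Unspecified> {
    let key = aead::UnboundKey::new(&aead::CHACHA20_POLY1305, &[0u8; 32])?;
    let key = aead::LessSafeKey::new(key);

    let mut in_out = b"hello, world".to_vec();
    let nonce = aead::Nonce::assume_unique_for_key([0u8; 12]);
    key.seal_in_place_append_tag(nonce, aead::Aad::empty(), &mut in_out)?;

    // Prepend junk bytes, as the test does, then open "within" the buffer;
    // the returned slice is the plaintext moved to the buffer's front.
    let prefix_len = 3;
    let mut buf = vec![123u8; prefix_len];
    buf.extend_from_slice(&in_out);
    let nonce = aead::Nonce::assume_unique_for_key([0u8; 12]);
    let plaintext = key.open_within(nonce, aead::Aad::empty(), &mut buf, prefix_len..)?;
    assert_eq!(plaintext, &b"hello, world"[..]);
    Ok(())
}
```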
+#[allow(clippy::range_plus_one)] #[test] fn test_aead_nonce_sizes() -> Result<(), error::Unspecified> { let nonce_len = aead::NONCE_LEN; @@ -350,6 +333,7 @@ fn test_aead_nonce_sizes() -> Result<(), error::Unspecified> { target_arch = "x86_64", target_arch = "x86" ))] +#[allow(clippy::range_plus_one)] #[test] fn aead_chacha20_poly1305_openssh() { // TODO: test_aead_key_sizes(...); @@ -380,7 +364,7 @@ fn aead_chacha20_poly1305_openssh() { let mut tag = [0u8; aead::chacha20_poly1305_openssh::TAG_LEN]; let mut s_in_out = plaintext.clone(); let s_key = aead::chacha20_poly1305_openssh::SealingKey::new(&key_bytes); - let () = s_key.seal_in_place(sequence_num, &mut s_in_out[..], &mut tag); + s_key.seal_in_place(sequence_num, &mut s_in_out[..], &mut tag); assert_eq!(&ct, &s_in_out); assert_eq!(&expected_tag, &tag); let o_key = aead::chacha20_poly1305_openssh::OpeningKey::new(&key_bytes); diff --git a/tests/agreement_tests.rs b/tests/agreement_tests.rs index 7e4152d139..4162015378 100644 --- a/tests/agreement_tests.rs +++ b/tests/agreement_tests.rs @@ -12,30 +12,12 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - extern crate alloc; use ring::{agreement, error, rand, test, test_file}; #[test] -fn agreement_traits<'a>() { +fn agreement_traits() { use alloc::vec::Vec; let rng = rand::SystemRandom::new(); @@ -61,9 +43,9 @@ fn agreement_traits<'a>() { // TODO: Test the actual output. let _: &dyn core::fmt::Debug = &public_key; - test::compile_time_assert_clone::>(); - test::compile_time_assert_copy::>(); - test::compile_time_assert_sync::>(); + test::compile_time_assert_clone::>(); + test::compile_time_assert_copy::>(); + test::compile_time_assert_sync::>(); test::compile_time_assert_clone::>>(); test::compile_time_assert_sync::>>(); @@ -105,13 +87,12 @@ fn agreement_agree_ephemeral() { assert_eq!(my_private.algorithm(), alg); - assert!( + let result = agreement::agree_ephemeral(my_private, &peer_public, (), |key_material| { assert_eq!(key_material, &output[..]); Ok(()) - }) - .is_ok() - ); + }); + assert_eq!(result, Ok(())); } Some(_) => { @@ -134,7 +115,7 @@ fn agreement_agree_ephemeral() { } } - return Ok(()); + Ok(()) }); } diff --git a/tests/constant_time_tests.rs b/tests/constant_time_tests.rs index 37bcebd93b..422ab2c8b9 100644 --- a/tests/constant_time_tests.rs +++ b/tests/constant_time_tests.rs @@ -28,7 +28,7 @@ fn test_verify_slices_are_equal() { let initial: [u8; 256] = rand::generate(&rand::SystemRandom::new()).unwrap().expose(); { - let copy = initial.clone(); + let copy = initial; for len in 0..copy.len() { // Not equal because the lengths do not match. assert_eq!( @@ -50,7 +50,7 @@ fn test_verify_slices_are_equal() { for i in 0..initial.len() { for bit in 0..8 { - let mut copy = initial.clone(); + let mut copy = initial; copy[i] ^= 1u8 << bit; for len in 0..=initial.len() { @@ -67,7 +67,7 @@ fn test_verify_slices_are_equal() { // The flipped bit is outside of `b` so `a` and `b` are equal. Ok(()) }; - assert_eq!((&a == &b), expected_result.is_ok()); // Sanity check. + assert_eq!(a == b, expected_result.is_ok()); // Sanity check. 
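In `agreement_tests.rs` above, `assert!(... .is_ok())` becomes `let result = ...; assert_eq!(result, Ok(()));`. The equality form prints both sides on failure, so a failing run shows the actual `Err` value instead of a bare "assertion failed". A minimal sketch:

```rust
fn agree() -> Result<(), &'static str> {
    Ok(()) // pretend the key agreement succeeded
}

fn main() {
    let result = agree();
    // On failure this would print e.g. `left: Err("bad key"), right: Ok(())`
    // rather than only `assertion failed: result.is_ok()`.
    assert_eq!(result, Ok(()));
}
```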
assert_eq!( constant_time::verify_slices_are_equal(&a, &b), expected_result diff --git a/tests/digest_tests.rs b/tests/digest_tests.rs index 1b16bb66e0..c275de7054 100644 --- a/tests/digest_tests.rs +++ b/tests/digest_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{digest, test, test_file}; #[cfg(target_arch = "wasm32")] diff --git a/tests/ecdsa_tests.rs b/tests/ecdsa_tests.rs index d0d728d043..317fdbc938 100644 --- a/tests/ecdsa_tests.rs +++ b/tests/ecdsa_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{ rand, signature::{self, KeyPair}, @@ -86,7 +68,7 @@ fn ecdsa_from_pkcs8_test() { match ( signature::EcdsaKeyPair::from_pkcs8(this_asn1, &input), - error.clone(), + error, ) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), @@ -209,9 +191,11 @@ fn ecdsa_test_public_key_coverage() { assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); // Test `Clone`. - { - let _ = key_pair.public_key().clone(); - } + #[allow(clippy::clone_on_copy, clippy::redundant_clone)] + let _: <signature::EcdsaKeyPair as KeyPair>::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: <signature::EcdsaKeyPair as KeyPair>::PublicKey = *key_pair.public_key(); // Test `Debug`. assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); diff --git a/tests/ed25519_tests.rs b/tests/ed25519_tests.rs index 0289362162..059ecdb044 100644 --- a/tests/ed25519_tests.rs +++ b/tests/ed25519_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
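The `ecdsa_test_public_key_coverage` hunk above pins down `Clone` and `Copy` on the public-key type, and the agreement tests earlier call ring's `test::compile_time_assert_*` helpers for the same purpose. Such helpers are typically nothing more than empty generic functions whose bounds do the checking; a sketch with a hypothetical key type:

```rust
// Instantiating one of these with a concrete type forces the compiler to
// prove the trait bound; the call itself compiles to nothing.
fn compile_time_assert_clone<T: Clone>() {}
fn compile_time_assert_copy<T: Copy>() {}
fn compile_time_assert_sync<T: Sync>() {}

#[derive(Clone, Copy)]
struct PublicKey([u8; 32]); // hypothetical stand-in

fn main() {
    compile_time_assert_clone::<PublicKey>();
    compile_time_assert_copy::<PublicKey>();
    compile_time_assert_sync::<PublicKey>();
}
```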
-#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{ signature::{self, Ed25519KeyPair, KeyPair}, test, test_file, @@ -114,14 +96,11 @@ fn test_ed25519_from_pkcs8_unchecked() { let input = test_case.consume_bytes("Input"); let error = test_case.consume_optional_string("Error"); - match ( - Ed25519KeyPair::from_pkcs8_maybe_unchecked(&input), - error.clone(), - ) { + match (Ed25519KeyPair::from_pkcs8_maybe_unchecked(&input), error) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), - (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + (Err(actual), Some(expected)) => assert_eq!(actual.description_(), expected), }; Ok(()) @@ -139,11 +118,11 @@ fn test_ed25519_from_pkcs8() { let input = test_case.consume_bytes("Input"); let error = test_case.consume_optional_string("Error"); - match (Ed25519KeyPair::from_pkcs8(&input), error.clone()) { + match (Ed25519KeyPair::from_pkcs8(&input), error) { (Ok(_), None) => (), (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e), (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e), - (Err(actual), Some(expected)) => assert_eq!(format!("{}", actual), expected), + (Err(actual), Some(expected)) => assert_eq!(actual.description_(), expected), }; Ok(()) @@ -155,7 +134,7 @@ fn test_ed25519_from_pkcs8() { fn ed25519_test_public_key_coverage() { const PRIVATE_KEY: &[u8] = include_bytes!("ed25519_test_private_key.p8"); const PUBLIC_KEY: &[u8] = include_bytes!("ed25519_test_public_key.der"); - const PUBLIC_KEY_DEBUG: &'static str = + const PUBLIC_KEY_DEBUG: &str = "PublicKey(\"5809e9fef6dcec58f0f2e3b0d67e9880a11957e083ace85835c3b6c8fbaf6b7d\")"; let key_pair = signature::Ed25519KeyPair::from_pkcs8(PRIVATE_KEY).unwrap(); @@ -164,7 +143,11 @@ fn ed25519_test_public_key_coverage() { assert_eq!(key_pair.public_key().as_ref(), PUBLIC_KEY); // Test `Clone`. - let _ = key_pair.public_key().clone(); + #[allow(clippy::clone_on_copy)] + let _: <Ed25519KeyPair as KeyPair>::PublicKey = key_pair.public_key().clone(); + + // Test `Copy`. + let _: <Ed25519KeyPair as KeyPair>::PublicKey = *key_pair.public_key(); // Test `Debug`. assert_eq!(PUBLIC_KEY_DEBUG, format!("{:?}", key_pair.public_key())); diff --git a/tests/hkdf_tests.rs b/tests/hkdf_tests.rs index e17968c456..88435a845e 100644 --- a/tests/hkdf_tests.rs +++ b/tests/hkdf_tests.rs @@ -12,23 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
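Both `from_pkcs8` tests above reconcile an actual `Result` with an optional expected error from the test file using the same four-arm match. Reduced to a sketch:

```rust
fn check_case<T>(result: Result<T, String>, expected_err: Option<&str>) {
    match (result, expected_err) {
        (Ok(_), None) => (),
        (Err(e), None) => panic!("Failed with error \"{}\", but expected to succeed", e),
        (Ok(_), Some(e)) => panic!("Succeeded, but expected error \"{}\"", e),
        (Err(actual), Some(expected)) => assert_eq!(actual, expected),
    }
}

fn main() {
    check_case(Ok(42), None);
    check_case(Err::<u32, _>("bad key".to_string()), Some("bad key"));
}
```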
-#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_results, - variant_size_differences, - warnings -)] - use ring::{digest, error, hkdf, test, test_file}; #[cfg(target_arch = "wasm32")] diff --git a/tests/hmac_tests.rs b/tests/hmac_tests.rs index 9e01714eb6..486a90a530 100644 --- a/tests/hmac_tests.rs +++ b/tests/hmac_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{digest, error, hmac, test, test_file}; #[cfg(target_arch = "wasm32")] diff --git a/tests/pbkdf2_tests.rs b/tests/pbkdf2_tests.rs index 0b0cf94b3b..13300fa46e 100644 --- a/tests/pbkdf2_tests.rs +++ b/tests/pbkdf2_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use core::num::NonZeroU32; use ring::{digest, error, pbkdf2, test, test_file}; diff --git a/tests/quic_tests.rs b/tests/quic_tests.rs index 472938f87d..545d7a76fb 100644 --- a/tests/quic_tests.rs +++ b/tests/quic_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - use ring::{aead::quic, test, test_file}; #[test] @@ -64,6 +46,7 @@ fn test_quic(alg: &'static quic::Algorithm, test_file: test::File) { }); } +#[allow(clippy::range_plus_one)] fn test_sample_len(alg: &'static quic::Algorithm) { let key_len = alg.key_len(); let key_data = vec![0u8; key_len]; diff --git a/tests/rsa_tests.rs b/tests/rsa_tests.rs index 03e062e29f..2b29b26150 100644 --- a/tests/rsa_tests.rs +++ b/tests/rsa_tests.rs @@ -12,24 +12,6 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-#![forbid( - anonymous_parameters, - box_pointers, - missing_copy_implementations, - missing_debug_implementations, - missing_docs, - trivial_casts, - trivial_numeric_casts, - unsafe_code, - unstable_features, - unused_extern_crates, - unused_import_braces, - unused_qualifications, - unused_results, - variant_size_differences, - warnings -)] - #[cfg(feature = "alloc")] use ring::{ error, diff --git a/util/ar/ar.go b/util/ar/ar.go deleted file mode 100644 index 756caf53d8..0000000000 --- a/util/ar/ar.go +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2017, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// ar.go contains functions for parsing .a archive files. - -package ar - -import ( - "bytes" - "errors" - "fmt" - "io" - "strconv" - "strings" -) - -// ParseAR parses an archive file from r and returns a map from filename to -// contents, or else an error. -func ParseAR(r io.Reader) (map[string][]byte, error) { - // See https://en.wikipedia.org/wiki/Ar_(Unix)#File_format_details - const expectedMagic = "!<arch>\n" - var magic [len(expectedMagic)]byte - if _, err := io.ReadFull(r, magic[:]); err != nil { - return nil, err - } - if string(magic[:]) != expectedMagic { - return nil, errors.New("ar: not an archive file") - } - - const filenameTableName = "//" - const symbolTableName = "/" - var longFilenameTable []byte - ret := make(map[string][]byte) - - for { - var header [60]byte - if _, err := io.ReadFull(r, header[:]); err != nil { - if err == io.EOF { - break - } - return nil, errors.New("ar: error reading file header: " + err.Error()) - } - - name := strings.TrimRight(string(header[:16]), " ") - sizeStr := strings.TrimRight(string(header[48:58]), "\x00 ") - size, err := strconv.ParseUint(sizeStr, 10, 64) - if err != nil { - return nil, errors.New("ar: failed to parse file size: " + err.Error()) - } - - // File contents are padded to a multiple of two bytes - storedSize := size - if storedSize%2 == 1 { - storedSize++ - } - - contents := make([]byte, storedSize) - if _, err := io.ReadFull(r, contents); err != nil { - return nil, errors.New("ar: error reading file contents: " + err.Error()) - } - contents = contents[:size] - - switch { - case name == filenameTableName: - if longFilenameTable != nil { - return nil, errors.New("ar: two filename tables found") - } - longFilenameTable = contents - continue - - case name == symbolTableName: - continue - - case len(name) > 1 && name[0] == '/': - if longFilenameTable == nil { - return nil, errors.New("ar: long filename reference found before filename table") - } - - // A long filename is stored as "/" followed by a - // base-10 offset in the filename table.
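The deleted `ParseAR` above reads fixed 60-byte member headers: bytes 0-15 hold the space-padded name, bytes 48-57 a decimal size, and member contents are padded to an even length. Just that header step, as a Rust sketch of the Go code:

```rust
// Parse one 60-byte ar member header: space-padded name, then a
// decimal size field at bytes 48..58.
fn parse_member_header(header: &[u8; 60]) -> Result<(String, u64), String> {
    let name = std::str::from_utf8(&header[..16])
        .map_err(|e| e.to_string())?
        .trim_end_matches(' ')
        .to_owned();
    let size = std::str::from_utf8(&header[48..58])
        .map_err(|e| e.to_string())?
        .trim_end_matches(|c| c == ' ' || c == '\0')
        .parse::<u64>()
        .map_err(|e| e.to_string())?;
    Ok((name, size))
}

// Contents are padded to a multiple of two bytes; the padding byte is
// read from the stream but is not part of the member.
fn stored_size(size: u64) -> u64 {
    size + (size % 2)
}
```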
- offset, err := strconv.ParseUint(name[1:], 10, 64) - if err != nil { - return nil, errors.New("ar: failed to parse filename offset: " + err.Error()) - } - if offset > uint64((^uint(0))>>1) { - return nil, errors.New("ar: filename offset overflow") - } - - if int(offset) > len(longFilenameTable) { - return nil, errors.New("ar: filename offset out of bounds") - } - - filename := longFilenameTable[offset:] - // Windows terminates filenames with NUL characters, - // while sysv/GNU uses /. - if i := bytes.IndexAny(filename, "/\x00"); i < 0 { - return nil, errors.New("ar: unterminated filename in table") - } else { - filename = filename[:i] - } - - name = string(filename) - - default: - name = strings.TrimRight(name, "/") - } - - // Post-processing for BSD: - // https://en.wikipedia.org/wiki/Ar_(Unix)#BSD_variant - // - // If the name is of the form #1/XXX, XXX identifies the length of the - // name, and the name itself is stored as a prefix of the data, possibly - // null-padded. - - var namelen uint - n, err := fmt.Sscanf(name, "#1/%d", &namelen) - if err == nil && n == 1 && len(contents) >= int(namelen) { - name = string(contents[:namelen]) - contents = contents[namelen:] - - // Names can be null padded; find the first null (if any). Note that - // this also handles the case of a null followed by non-null - // characters. It's not clear whether those can ever show up in - // practice, but we might as well handle them in case they can show - // up. - var null int - for ; null < len(name); null++ { - if name[null] == 0 { - break - } - } - name = name[:null] - } - - if name == "__.SYMDEF" || name == "__.SYMDEF SORTED" { - continue - } - - ret[name] = contents - } - - return ret, nil -} diff --git a/util/ar/ar_test.go b/util/ar/ar_test.go deleted file mode 100644 index ef37d795d2..0000000000 --- a/util/ar/ar_test.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -package ar - -import ( - "bytes" - "flag" - "io/ioutil" - "os" - "path/filepath" - "testing" -) - -var testDataDir = flag.String("testdata", "testdata", "The path to the test data directory.") - -type arTest struct { - name string - in string - out map[string]string - // allowPadding is true if the contents may have trailing newlines at end. - // On macOS, ar calls ranlib which pads all inputs up to eight bytes with - // newlines. Unlike ar's native padding up to two bytes, this padding is - // included in the size field, so it is not removed when decoding. 
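The rest of the parser resolves the two extended-name schemes: GNU names of the form `/<offset>` index into the `//` filename table, whose entries end with `/` (sysv/GNU) or NUL (some Windows tools), while BSD names of the form `#1/<len>` store the real, possibly NUL-padded name as a prefix of the member data. Both lookups as a sketch:

```rust
// GNU: "/<decimal offset>" into the "//" table.
fn resolve_gnu_long_name(name: &str, table: &[u8]) -> Option<String> {
    let offset: usize = name.strip_prefix('/')?.parse().ok()?;
    let rest = table.get(offset..)?;
    let end = rest.iter().position(|&b| b == b'/' || b == 0)?;
    String::from_utf8(rest[..end].to_vec()).ok()
}

// BSD: "#1/<len>"; returns the name and how many data bytes it consumed.
fn resolve_bsd_name(name: &str, contents: &[u8]) -> Option<(String, usize)> {
    let len: usize = name.strip_prefix("#1/")?.parse().ok()?;
    let raw = contents.get(..len)?;
    // The stored name may be NUL-padded; keep only up to the first NUL.
    let end = raw.iter().position(|&b| b == 0).unwrap_or(len);
    Some((String::from_utf8(raw[..end].to_vec()).ok()?, len))
}
```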
- allowPadding bool } - -func (test *arTest) Path(file string) string { - return filepath.Join(*testDataDir, test.name, file) -} - -func removeTrailingNewlines(in []byte) []byte { - for len(in) > 0 && in[len(in)-1] == '\n' { - in = in[:len(in)-1] - } - return in -} - -var arTests = []arTest{ - { - "linux", - "libsample.a", - map[string]string{ - "foo.c.o": "foo.c.o", - "bar.cc.o": "bar.cc.o", - }, - false, - }, - { - "mac", - "libsample.a", - map[string]string{ - "foo.c.o": "foo.c.o", - "bar.cc.o": "bar.cc.o", - }, - true, - }, - { - "windows", - "sample.lib", - map[string]string{ - "CMakeFiles\\sample.dir\\foo.c.obj": "foo.c.obj", - "CMakeFiles\\sample.dir\\bar.cc.obj": "bar.cc.obj", - }, - false, - }, -} - -func TestAR(t *testing.T) { - for _, test := range arTests { - t.Run(test.name, func(t *testing.T) { - in, err := os.Open(test.Path(test.in)) - if err != nil { - t.Fatalf("opening input failed: %s", err) - } - defer in.Close() - - ret, err := ParseAR(in) - if err != nil { - t.Fatalf("reading input failed: %s", err) - } - - for file, contentsPath := range test.out { - expected, err := ioutil.ReadFile(test.Path(contentsPath)) - if err != nil { - t.Fatalf("error reading %s: %s", contentsPath, err) - } - got, ok := ret[file] - if test.allowPadding { - got = removeTrailingNewlines(got) - expected = removeTrailingNewlines(expected) - } - if !ok { - t.Errorf("file %s missing from output", file) - } else if !bytes.Equal(got, expected) { - t.Errorf("contents for file %s did not match", file) - } - } - - for file, _ := range ret { - if _, ok := test.out[file]; !ok { - t.Errorf("output contained unexpected file %q", file) - } - } - }) - } -} diff --git a/util/ar/testdata/linux/bar.cc.o b/util/ar/testdata/linux/bar.cc.o deleted file mode 100644 index 92e83a9a11..0000000000 Binary files a/util/ar/testdata/linux/bar.cc.o and /dev/null differ diff --git a/util/ar/testdata/linux/foo.c.o b/util/ar/testdata/linux/foo.c.o deleted file mode 100644 index 6423c1d49b..0000000000 Binary files a/util/ar/testdata/linux/foo.c.o and /dev/null differ diff --git a/util/ar/testdata/linux/libsample.a b/util/ar/testdata/linux/libsample.a deleted file mode 100644 index cae6ae70c9..0000000000 Binary files a/util/ar/testdata/linux/libsample.a and /dev/null differ diff --git a/util/ar/testdata/mac/bar.cc.o b/util/ar/testdata/mac/bar.cc.o deleted file mode 100644 index 9c60798532..0000000000 Binary files a/util/ar/testdata/mac/bar.cc.o and /dev/null differ diff --git a/util/ar/testdata/mac/foo.c.o b/util/ar/testdata/mac/foo.c.o deleted file mode 100644 index 0f96a0a018..0000000000 Binary files a/util/ar/testdata/mac/foo.c.o and /dev/null differ diff --git a/util/ar/testdata/mac/libsample.a b/util/ar/testdata/mac/libsample.a deleted file mode 100644 index b7d8eb5ce0..0000000000 Binary files a/util/ar/testdata/mac/libsample.a and /dev/null differ diff --git a/util/ar/testdata/sample/CMakeLists.txt b/util/ar/testdata/sample/CMakeLists.txt deleted file mode 100644 index 9ea2fe8ee1..0000000000 --- a/util/ar/testdata/sample/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -cmake_minimum_required(VERSION 3.0) -project(Sample) -add_library(sample STATIC foo.c bar.cc) diff --git a/util/ar/testdata/sample/bar.cc b/util/ar/testdata/sample/bar.cc deleted file mode 100644 index a0ac7e14ab..0000000000 --- a/util/ar/testdata/sample/bar.cc +++ /dev/null @@ -1,15 +0,0 @@ -extern "C" { -void foo(); -void bar() {} -} - -namespace bar_namespace { - -void SomeExternalFunction(); - -void SomeFunction() { - foo(); - SomeExternalFunction(); -} - -} //
namespace bar_namespace diff --git a/util/ar/testdata/sample/foo.c b/util/ar/testdata/sample/foo.c deleted file mode 100644 index fed596cbe2..0000000000 --- a/util/ar/testdata/sample/foo.c +++ /dev/null @@ -1,7 +0,0 @@ -extern void external_symbol(void); -extern void bar(void); - -void foo(void) { - external_symbol(); - bar(); -} diff --git a/util/ar/testdata/windows/bar.cc.obj b/util/ar/testdata/windows/bar.cc.obj deleted file mode 100644 index 4a315cdd6b..0000000000 Binary files a/util/ar/testdata/windows/bar.cc.obj and /dev/null differ diff --git a/util/ar/testdata/windows/foo.c.obj b/util/ar/testdata/windows/foo.c.obj deleted file mode 100644 index 9b4aad7a42..0000000000 Binary files a/util/ar/testdata/windows/foo.c.obj and /dev/null differ diff --git a/util/ar/testdata/windows/sample.lib b/util/ar/testdata/windows/sample.lib deleted file mode 100644 index efeebb24e5..0000000000 Binary files a/util/ar/testdata/windows/sample.lib and /dev/null differ diff --git a/util/generate-asm-lcov.py b/util/generate-asm-lcov.py deleted file mode 100755 index 257ae841c3..0000000000 --- a/util/generate-asm-lcov.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016, Google Inc. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -import os -import os.path -import subprocess -import sys - -# The LCOV output format for each source file is: -# -# SF:<filename> -# DA:<line number>,<execution count> -# ... -# end_of_record -# -# The <execution count> can either be 0 for an unexecuted instruction or a -# value representing the number of executions. The DA line should be omitted -# for lines not representing an instruction. - -SECTION_SEPERATOR = '-' * 80 - -def is_asm(l): - """Returns whether a line should be considered to be an instruction.""" - l = l.strip() - # Empty lines - if l == '': - return False - # Comments - if l.startswith('#'): - return False - # Assembly Macros - if l.startswith('.'): - return False - # Label - if l.endswith(':'): - return False - return True - -def merge(callgrind_files, srcs): - """Calls callgrind_annotate over the set of callgrind output - |callgrind_files| using the sources |srcs| and merges the results - together.""" - out = '' - for file in callgrind_files: - data = subprocess.check_output(['callgrind_annotate', file] + srcs) - out += '%s\n%s\n' % (data, SECTION_SEPERATOR) - return out - -def parse(filename, data, current): - """Parses an annotated execution flow |data| from callgrind_annotate for - source |filename| and updates the current execution counts from |current|.""" - with open(filename) as f: - source = f.read().split('\n') - - out = current - if out == None: - out = [0 if is_asm(l) else None for l in source] - - # Lines are of the following formats: - # -- line: Indicates that analysis continues from a different place. - # Ir : Indicates the start of a file. - # => : Indicates a call/jump in the control flow.
- # <count> <code>: Indicates that the line has been executed that many times. - line = None - for l in data: - l = l.strip() + ' ' - if l.startswith('-- line'): - line = int(l.split(' ')[2]) - 1 - elif l.strip() == 'Ir': - line = 0 - elif line != None and l.strip() and '=>' not in l and 'unidentified lines' not in l: - count = l.split(' ')[0].replace(',', '').replace('.', '0') - instruction = l.split(' ', 1)[1].strip() - if count != '0' or is_asm(instruction): - if out[line] == None: - out[line] = 0 - out[line] += int(count) - line += 1 - - return out - - -def generate(data): - """Parses the merged callgrind_annotate output |data| and generates execution - counts for all annotated files.""" - out = {} - data = [p.strip() for p in data.split(SECTION_SEPERATOR)] - - - # Most sections are ignored, but a section with: - # User-annotated source: <file> - # precedes a listing of execution count for that <file>. - for i in range(len(data)): - if 'User-annotated source' in data[i] and i < len(data) - 1: - filename = data[i].split(':', 1)[1].strip() - res = data[i + 1] - if filename not in out: - out[filename] = None - if 'No information' in res: - res = [] - else: - res = res.split('\n') - out[filename] = parse(filename, res, out[filename]) - return out - -def output(data): - """Takes a dictionary |data| of filenames and execution counts and generates - a LCOV coverage output.""" - out = '' - for filename, counts in data.iteritems(): - out += 'SF:%s\n' % (os.path.abspath(filename)) - for line, count in enumerate(counts): - if count != None: - out += 'DA:%d,%s\n' % (line + 1, count) - out += 'end_of_record\n' - return out - -if __name__ == '__main__': - if len(sys.argv) != 3: - print '%s <callgrind-folder> <build-folder>' % (__file__) - sys.exit() - - cg_folder = sys.argv[1] - build_folder = sys.argv[2] - - cg_files = [] - for (cwd, _, files) in os.walk(cg_folder): - for f in files: - if f.startswith('callgrind.out'): - cg_files.append(os.path.abspath(os.path.join(cwd, f))) - - srcs = [] - for (cwd, _, files) in os.walk(build_folder): - for f in files: - fn = os.path.join(cwd, f) - if fn.endswith('.S'): - srcs.append(fn) - - annotated = merge(cg_files, srcs) - lcov = generate(annotated) - print output(lcov) diff --git a/util/generate-coverage.sh b/util/generate-coverage.sh deleted file mode 100755 index 2fbe6b8378..0000000000 --- a/util/generate-coverage.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/sh -# Copyright (c) 2016, Google Inc. -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
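`output()` above is the whole LCOV emission step: one `SF:` line per file, a `DA:<line>,<count>` line per instruction, and `end_of_record` to close. The same logic as a Rust sketch:

```rust
// Render one file's per-line counts as an LCOV record. `None` marks a
// line that is not an instruction, so no DA: line is emitted for it.
fn to_lcov(filename: &str, counts: &[Option<u64>]) -> String {
    let mut out = format!("SF:{}\n", filename);
    for (idx, count) in counts.iter().enumerate() {
        if let Some(c) = count {
            out.push_str(&format!("DA:{},{}\n", idx + 1, c));
        }
    }
    out.push_str("end_of_record\n");
    out
}
```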
- -set -xe - -SRC=$PWD - -BUILD=$(mktemp -d '/tmp/boringssl.XXXXXX') -BUILD_SRC=$(mktemp -d '/tmp/boringssl-src.XXXXXX') -LCOV=$(mktemp -d '/tmp/boringssl-lcov.XXXXXX') - -if [ -n "$1" ]; then - LCOV=$(readlink -f "$1") - mkdir -p "$LCOV" -fi - -cd "$BUILD" -cmake "$SRC" -GNinja -DCMAKE_C_FLAGS='-fprofile-arcs -ftest-coverage' \ - -DCMAKE_CXX_FLAGS='-fprofile-arcs -ftest-coverage' -DCMAKE_ASM_FLAGS='-Wa,-g' -ninja - -cp -r "$SRC/crypto" "$SRC/decrepit" "$SRC/include" "$SRC/ssl" "$SRC/tool" \ - "$BUILD_SRC" -cp -r "$BUILD"/* "$BUILD_SRC" -mkdir "$BUILD/callgrind/" - -cd "$SRC" -go run "$SRC/util/all_tests.go" -build-dir "$BUILD" -callgrind -num-workers 16 -util/generate-asm-lcov.py "$BUILD/callgrind" "$BUILD" > "$BUILD/asm.info" - -go run "util/all_tests.go" -build-dir "$BUILD" - -cd "$SRC/ssl/test/runner" -go test -shim-path "$BUILD/ssl/test/bssl_shim" -num-workers 1 - -cd "$LCOV" -lcov -c -d "$BUILD" -b "$BUILD" -o "$BUILD/lcov.info" -lcov -r "$BUILD/lcov.info" "*_test.c" -o "$BUILD/lcov-1.info" -lcov -r "$BUILD/lcov-1.info" "*_test.cc" -o "$BUILD/lcov-2.info" -cat "$BUILD/lcov-2.info" "$BUILD/asm.info" > "$BUILD/final.info" -sed -i "s;$BUILD;$BUILD_SRC;g" "$BUILD/final.info" -sed -i "s;$SRC;$BUILD_SRC;g" "$BUILD/final.info" -genhtml -p "$BUILD_SRC" "$BUILD/final.info" - -rm -rf "$BUILD" -rm -rf "$BUILD_SRC" - -xdg-open index.html diff --git a/util/make_prefix_headers.go b/util/make_prefix_headers.go deleted file mode 100644 index b536f14cea..0000000000 --- a/util/make_prefix_headers.go +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -// This program takes a file containing newline-separated symbols, and generates -// boringssl_prefix_symbols.h, boringssl_prefix_symbols_asm.h, and -// boringssl_prefix_symbols_nasm.inc. These header files can be used to build -// BoringSSL with a prefix for all symbols in order to avoid symbol name -// conflicts when linking a project with multiple copies of BoringSSL; see -// BUILDING.md for more details. - -// TODO(joshlf): For platforms which support it, use '#pragma redefine_extname' -// instead of a custom macro. This avoids the need for a custom macro, but also -// ensures that our renaming won't conflict with symbols defined and used by our -// consumers (the "HMAC" problem). An example of this approach can be seen in -// IllumOS' fork of OpenSSL: -// https://github.com/joyent/illumos-extra/blob/master/openssl1x/sunw_prefix.h - -package main - -import ( - "bufio" - "flag" - "fmt" - "os" - "path/filepath" - "strings" -) - -var out = flag.String("out", ".", "Path to a directory where the outputs will be written") - -// Read newline-separated symbols from a file, ignoring any comments started -// with '#'. 
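`readSymbols` at the end of the chunk above defines the generator's input format: one symbol per line, `#` starts a comment, blank lines are ignored. Equivalent parsing, sketched:

```rust
fn read_symbols(text: &str) -> Vec<String> {
    text.lines()
        // Drop anything after a '#' comment marker.
        .map(|line| line.split('#').next().unwrap_or(""))
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .map(str::to_owned)
        .collect()
}
```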
-func readSymbols(path string) ([]string, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - scanner := bufio.NewScanner(f) - var ret []string - for scanner.Scan() { - line := scanner.Text() - if idx := strings.IndexByte(line, '#'); idx >= 0 { - line = line[:idx] - } - line = strings.TrimSpace(line) - if len(line) == 0 { - continue - } - ret = append(ret, line) - } - if err := scanner.Err(); err != nil { - return nil, err - } - return ret, nil -} - -func writeCHeader(symbols []string, path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - if _, err := f.WriteString(`// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -// BORINGSSL_ADD_PREFIX pastes two identifiers into one. It performs one -// iteration of macro expansion on its arguments before pasting. -#define BORINGSSL_ADD_PREFIX(a, b) BORINGSSL_ADD_PREFIX_INNER(a, b) -#define BORINGSSL_ADD_PREFIX_INNER(a, b) a ## _ ## b - -`); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "#define %s BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, %s)\n", symbol, symbol); err != nil { - return err - } - } - - return nil -} - -func writeASMHeader(symbols []string, path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - if _, err := f.WriteString(`// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. -// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -#if !defined(__APPLE__) -#include <boringssl_prefix_symbols.h> -#else -// On iOS and macOS, we need to treat assembly symbols differently from other -// symbols. The linker expects symbols to be prefixed with an underscore. -// Perlasm thus generates symbols with this underscore applied. Our macros must, -// in turn, incorporate it.
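The generated C header above needs the two-level `BORINGSSL_ADD_PREFIX`/`BORINGSSL_ADD_PREFIX_INNER` indirection because `##` pastes its operands before expanding them: pasting `BORINGSSL_PREFIX ## _ ## foo` directly would embed the literal token `BORINGSSL_PREFIX` in the result, while the outer macro forces one round of expansion first. The per-symbol emission loop, sketched in Rust rather than Go:

```rust
// Emit one #define per symbol, routing through the two-level paste macro
// so BORINGSSL_PREFIX is expanded before ## runs.
fn emit_defines(symbols: &[&str]) -> String {
    let mut out = String::new();
    for sym in symbols {
        out.push_str(&format!(
            "#define {sym} BORINGSSL_ADD_PREFIX(BORINGSSL_PREFIX, {sym})\n"
        ));
    }
    out
}

fn main() {
    print!("{}", emit_defines(&["RSA_sign", "SHA256_Init"]));
}
```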
-#define BORINGSSL_ADD_PREFIX_MAC_ASM(a, b) BORINGSSL_ADD_PREFIX_INNER_MAC_ASM(a, b) -#define BORINGSSL_ADD_PREFIX_INNER_MAC_ASM(a, b) _ ## a ## _ ## b - -`); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "#define _%s BORINGSSL_ADD_PREFIX_MAC_ASM(BORINGSSL_PREFIX, %s)\n", symbol, symbol); err != nil { - return err - } - } - - _, err = fmt.Fprintf(f, "#endif\n") - return nil -} - -func writeNASMHeader(symbols []string, path string) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - // NASM uses a different syntax from the C preprocessor. - if _, err := f.WriteString(`; Copyright (c) 2018, Google Inc. -; -; Permission to use, copy, modify, and/or distribute this software for any -; purpose with or without fee is hereby granted, provided that the above -; copyright notice and this permission notice appear in all copies. -; -; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -; WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -; MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -; SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -; OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -; CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -; 32-bit Windows adds underscores to C functions, while 64-bit Windows does not. -%ifidn __OUTPUT_FORMAT__, win32 -`); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "%%xdefine _%s _ %%+ BORINGSSL_PREFIX %%+ _%s\n", symbol, symbol); err != nil { - return err - } - } - - if _, err := fmt.Fprintf(f, "%%else\n"); err != nil { - return err - } - - for _, symbol := range symbols { - if _, err := fmt.Fprintf(f, "%%xdefine %s BORINGSSL_PREFIX %%+ _%s\n", symbol, symbol); err != nil { - return err - } - } - - if _, err := fmt.Fprintf(f, "%%endif\n"); err != nil { - return err - } - - return nil -} - -func main() { - flag.Parse() - if flag.NArg() != 1 { - fmt.Fprintf(os.Stderr, "Usage: %s [-out OUT] SYMBOLS\n", os.Args[0]) - os.Exit(1) - } - - symbols, err := readSymbols(flag.Arg(0)) - if err != nil { - fmt.Fprintf(os.Stderr, "Error reading symbols: %s\n", err) - os.Exit(1) - } - - if err := writeCHeader(symbols, filepath.Join(*out, "boringssl_prefix_symbols.h")); err != nil { - fmt.Fprintf(os.Stderr, "Error writing boringssl_prefix_symbols.h: %s\n", err) - os.Exit(1) - } - - if err := writeASMHeader(symbols, filepath.Join(*out, "boringssl_prefix_symbols_asm.h")); err != nil { - fmt.Fprintf(os.Stderr, "Error writing boringssl_prefix_symbols_asm.h: %s\n", err) - os.Exit(1) - } - - if err := writeNASMHeader(symbols, filepath.Join(*out, "boringssl_prefix_symbols_nasm.inc")); err != nil { - fmt.Fprintf(os.Stderr, "Error writing boringssl_prefix_symbols_nasm.inc: %s\n", err) - os.Exit(1) - } - -} diff --git a/util/read_symbols.go b/util/read_symbols.go deleted file mode 100644 index 791ea5d126..0000000000 --- a/util/read_symbols.go +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (c) 2018, Google Inc. -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. 
-// -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -// read_symbols scans one or more .a files and, for each object contained in -// the .a files, reads the list of symbols in that object file. -package main - -import ( - "bytes" - "debug/elf" - "debug/macho" - "debug/pe" - "flag" - "fmt" - "os" - "runtime" - "sort" - "strings" - - "boringssl.googlesource.com/boringssl/util/ar" -) - -const ( - ObjFileFormatELF = "elf" - ObjFileFormatMachO = "macho" - ObjFileFormatPE = "pe" -) - -var ( - outFlag = flag.String("out", "-", "File to write output symbols") - objFileFormat = flag.String("obj-file-format", defaultObjFileFormat(runtime.GOOS), "Object file format to expect (options are elf, macho, pe)") -) - -func defaultObjFileFormat(goos string) string { - switch goos { - case "linux": - return ObjFileFormatELF - case "darwin": - return ObjFileFormatMachO - case "windows": - return ObjFileFormatPE - default: - // By returning a value here rather than panicking, the user can still - // cross-compile from an unsupported platform to a supported platform by - // overriding this default with a flag. If the user doesn't provide the - // flag, we will panic during flag parsing. - return "unsupported" - } -} - -func printAndExit(format string, args ...interface{}) { - s := fmt.Sprintf(format, args...) - fmt.Fprintln(os.Stderr, s) - os.Exit(1) -} - -func main() { - flag.Parse() - if flag.NArg() < 1 { - printAndExit("Usage: %s [-out OUT] [-obj-file-format FORMAT] ARCHIVE_FILE [ARCHIVE_FILE [...]]", os.Args[0]) - } - archiveFiles := flag.Args() - - out := os.Stdout - if *outFlag != "-" { - var err error - out, err = os.Create(*outFlag) - if err != nil { - printAndExit("Error opening %q: %s", *outFlag, err) - } - defer out.Close() - } - - var symbols []string - // Only add first instance of any symbol; keep track of them in this map. - added := make(map[string]struct{}) - for _, archive := range archiveFiles { - f, err := os.Open(archive) - if err != nil { - printAndExit("Error opening %s: %s", archive, err) - } - objectFiles, err := ar.ParseAR(f) - f.Close() - if err != nil { - printAndExit("Error parsing %s: %s", archive, err) - } - - for name, contents := range objectFiles { - syms, err := listSymbols(contents) - if err != nil { - printAndExit("Error listing symbols from %q in %q: %s", name, archive, err) - } - for _, s := range syms { - if _, ok := added[s]; !ok { - added[s] = struct{}{} - symbols = append(symbols, s) - } - } - } - } - - sort.Strings(symbols) - for _, s := range symbols { - var skipSymbols = []string{ - // Inline functions, etc., from the compiler or language - // runtime will naturally end up in the library, to be - // deduplicated against other object files. Such symbols - // should not be prefixed. It is a limitation of this - // symbol-prefixing strategy that we cannot distinguish - // our own inline symbols (which should be prefixed) - // from the system's (which should not), so we blacklist - // known system symbols. 
- "__local_stdio_printf_options", - "__local_stdio_scanf_options", - "_vscprintf", - "_vscprintf_l", - "_vsscanf_l", - "_xmm", - "sscanf", - "vsnprintf", - // sdallocx is a weak symbol and intended to merge with - // the real one, if present. - "sdallocx", - } - var skip bool - for _, sym := range skipSymbols { - if sym == s { - skip = true - break - } - } - if skip || isCXXSymbol(s) || strings.HasPrefix(s, "__real@") || strings.HasPrefix(s, "__x86.get_pc_thunk.") { - continue - } - if _, err := fmt.Fprintln(out, s); err != nil { - printAndExit("Error writing to %s: %s", *outFlag, err) - } - } -} - -func isCXXSymbol(s string) bool { - if *objFileFormat == ObjFileFormatPE { - return strings.HasPrefix(s, "?") - } - return strings.HasPrefix(s, "_Z") -} - -// listSymbols lists the exported symbols from an object file. -func listSymbols(contents []byte) ([]string, error) { - switch *objFileFormat { - case ObjFileFormatELF: - return listSymbolsELF(contents) - case ObjFileFormatMachO: - return listSymbolsMachO(contents) - case ObjFileFormatPE: - return listSymbolsPE(contents) - default: - return nil, fmt.Errorf("unsupported object file format %q", *objFileFormat) - } -} - -func listSymbolsELF(contents []byte) ([]string, error) { - f, err := elf.NewFile(bytes.NewReader(contents)) - if err != nil { - return nil, err - } - syms, err := f.Symbols() - if err != nil { - return nil, err - } - - var names []string - for _, sym := range syms { - // Only include exported, defined symbols - if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF { - names = append(names, sym.Name) - } - } - return names, nil -} - -func listSymbolsMachO(contents []byte) ([]string, error) { - f, err := macho.NewFile(bytes.NewReader(contents)) - if err != nil { - return nil, err - } - if f.Symtab == nil { - return nil, nil - } - var names []string - for _, sym := range f.Symtab.Syms { - // Source: https://opensource.apple.com/source/xnu/xnu-3789.51.2/EXTERNAL_HEADERS/mach-o/nlist.h.auto.html - const ( - N_PEXT uint8 = 0x10 // Private external symbol bit - N_EXT uint8 = 0x01 // External symbol bit, set for external symbols - N_TYPE uint8 = 0x0e // mask for the type bits - - N_UNDF uint8 = 0x0 // undefined, n_sect == NO_SECT - N_ABS uint8 = 0x2 // absolute, n_sect == NO_SECT - N_SECT uint8 = 0xe // defined in section number n_sect - N_PBUD uint8 = 0xc // prebound undefined (defined in a dylib) - N_INDR uint8 = 0xa // indirect - ) - - // Only include exported, defined symbols. - if sym.Type&N_EXT != 0 && sym.Type&N_TYPE != N_UNDF { - if len(sym.Name) == 0 || sym.Name[0] != '_' { - return nil, fmt.Errorf("unexpected symbol without underscore prefix: %q", sym.Name) - } - names = append(names, sym.Name[1:]) - } - } - return names, nil -} - -func listSymbolsPE(contents []byte) ([]string, error) { - f, err := pe.NewFile(bytes.NewReader(contents)) - if err != nil { - return nil, err - } - var ret []string - for _, sym := range f.Symbols { - const ( - // https://docs.microsoft.com/en-us/windows/desktop/debug/pe-format#section-number-values - IMAGE_SYM_UNDEFINED = 0 - // https://docs.microsoft.com/en-us/windows/desktop/debug/pe-format#storage-class - IMAGE_SYM_CLASS_EXTERNAL = 2 - ) - if sym.SectionNumber != IMAGE_SYM_UNDEFINED && sym.StorageClass == IMAGE_SYM_CLASS_EXTERNAL { - name := sym.Name - if f.Machine == pe.IMAGE_FILE_MACHINE_I386 { - // On 32-bit Windows, C symbols are decorated by calling - // convention. 
- // https://msdn.microsoft.com/en-us/library/56h2zst2.aspx#FormatC - if strings.HasPrefix(name, "_") || strings.HasPrefix(name, "@") { - // __cdecl, __stdcall, or __fastcall. Remove the prefix and - // suffix, if present. - name = name[1:] - if idx := strings.LastIndex(name, "@"); idx >= 0 { - name = name[:idx] - } - } else if idx := strings.LastIndex(name, "@@"); idx >= 0 { - // __vectorcall. Remove the suffix. - name = name[:idx] - } - } - ret = append(ret, name) - } - } - return ret, nil -}
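The 32-bit Windows branch above undoes calling-convention decoration: `__cdecl`, `__stdcall`, and `__fastcall` symbols carry a `_` or `@` prefix plus an optional `@<bytes>` suffix, while `__vectorcall` symbols have an `@@<bytes>` suffix and no prefix. The same transformation as a Rust sketch:

```rust
fn undecorate_i386(name: &str) -> &str {
    if let Some(rest) = name.strip_prefix('_').or_else(|| name.strip_prefix('@')) {
        // __cdecl, __stdcall, or __fastcall: drop the prefix and any
        // "@<stack bytes>" suffix.
        match rest.rfind('@') {
            Some(idx) => &rest[..idx],
            None => rest,
        }
    } else if let Some(idx) = name.find("@@") {
        // __vectorcall: drop the "@@<stack bytes>" suffix.
        &name[..idx]
    } else {
        name
    }
}

fn main() {
    assert_eq!(undecorate_i386("_RSA_sign@12"), "RSA_sign");
    assert_eq!(undecorate_i386("vector_fn@@16"), "vector_fn");
}
```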