diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9e9b0b6..b7e6db9 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -68,6 +68,10 @@ jobs:
     name: Compile (MacOS clang, aarch64)
     needs: [ dist ]
     runs-on: macOS-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        feature: [ enable-aarch64, disable-aarch64 ]
     steps:
       - name: Download source package artifact
         uses: actions/download-artifact@v4
@@ -76,7 +80,7 @@ jobs:
       - name: Extract source package
         run: tar --strip-components=1 -xf xpar-${{ github.sha }}.tar.gz
       - name: Configure
-        run: ./configure CC=clang
+        run: ./configure CC=clang --${{ matrix.feature }}
       - name: Make
         run: make all self-check
 
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4b3914e..e2ee976 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -52,8 +52,8 @@ jobs:
       fail-fast: false
       matrix:
         target:
-          - [ "x86_64-linux", "--enable-static --enable-lto", "" ]
-          - [ "x86_64", "CC=x86_64-w64-mingw32-gcc --host x86_64-w64-mingw32 --enable-static --enable-lto", "gcc-mingw-w64-x86-64" ]
+          - [ "x86_64-linux", "--enable-static --enable-lto --enable-x86-64", "" ]
+          - [ "x86_64", "CC=x86_64-w64-mingw32-gcc --host x86_64-w64-mingw32 --enable-static --enable-lto --enable-x86-64", "gcc-mingw-w64-x86-64" ]
           - [ "i686", "CC=i686-w64-mingw32-gcc --host i686-w64-mingw32 --enable-static --enable-lto", "gcc-mingw-w64-i686" ]
     steps:
       - name: Download source package artifact
diff --git a/Makefile.am b/Makefile.am
index ced1432..9206978 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,6 +10,10 @@ SUFFIXES = .asm
 	$(NASM) $(NAFLAGS) -o $@ $<
 endif
 
+if XPAR_AARCH64
+xpar_SOURCES += xpar-aarch64.S
+endif
+
 # Developer convenience targets
 .PHONY: update-ChangeLog
 update-ChangeLog:
diff --git a/README.md b/README.md
index 12e11c2..fc8735f 100644
--- a/README.md
+++ b/README.md
@@ -38,14 +38,11 @@ A rough outline of some development-related topics below.
 ## Roadmap
 
 - Need to implement the parallel variant.
-- Need to provide automatic testing for all configurations.
 - Write a proper readme, manpages, etc.
-- Make sure that this builds on Windows and port over the assembly code.
-- Fuzz to find segfaults.
-- Apple M1-specific optimizations to CRC32.
 - 32- vs 64-bit code: determine if there's any compatibility issues.
 
 Low priority:
+- Speed up the joint mode encoder loop (basically a LFSR).
 - Should probably not pad to the full interlacing block size.
 - Add assembly routines for hot spots in the program.
 
@@ -61,8 +58,3 @@ As it stands:
 
 Code style:
 - Two space indent, brace on the same line, middle pointers - `char * p;`.
-
-## Show-stoppers
-
-clang bug (msys2 packages repository ticket 4958) - makes it impossible to
-build on x86_64 Windows with clang. Only gcc is supported.
diff --git a/configure.ac b/configure.ac
index 33d4c41..5baaeea 100644
--- a/configure.ac
+++ b/configure.ac
@@ -15,6 +15,7 @@ AM_INIT_AUTOMAKE([-Wall color-tests])
 AC_PROG_INSTALL
 AC_PROG_MAKE_SET
 AC_PROG_CC
+AM_PROG_AS
 
 AC_CHECK_HEADERS([getopt.h io.h])
 AC_CHECK_FUNCS([getopt_long asprintf strndup stat _commit _setmode isatty fsync mmap CreateFileMappingA])
@@ -34,6 +35,14 @@ else
   AM_CONDITIONAL([XPAR_X86_64], [false])
 fi
 
+AC_ARG_ENABLE([aarch64], [AS_HELP_STRING([--enable-aarch64], [Enable aarch64 platform specific code.])], [enable_aarch64=$enableval], [enable_aarch64=no])
+if test "x$enable_aarch64" = "xyes"; then
+  AC_DEFINE([XPAR_AARCH64], [1], [Enable aarch64 platform specific code.])
+  AM_CONDITIONAL([XPAR_AARCH64], [true])
+else
+  AM_CONDITIONAL([XPAR_AARCH64], [false])
+fi
+
 AC_ARG_ENABLE([native], [AS_HELP_STRING([--enable-native], [Enable native platform optimisations.])], [enable_native=$enableval], [enable_native=no])
 if test "x$enable_native" = "xyes"; then
   AX_APPEND_COMPILE_FLAGS([-march=native -mtune=native])
diff --git a/crc32c.c b/crc32c.c
index 047d850..fa1776b 100644
--- a/crc32c.c
+++ b/crc32c.c
@@ -89,26 +89,38 @@ u32 crc32c_tabular(u32 crc, u8 * data, sz length) {
   return crc;
 }
 
+typedef u32 (*crc32c_func)(u32, u8 *, sz);
+
 #if defined(XPAR_X86_64)
   #ifdef HAVE_FUNC_ATTRIBUTE_SYSV_ABI
     #define EXTERNAL_ABI __attribute__((sysv_abi))
   #else
     #define EXTERNAL_ABI
   #endif
-
-  typedef u32 (*crc32c_func)(u32, u8 *, sz);
+
   extern EXTERNAL_ABI int crc32c_x86_64_cpuflags(void);
   extern EXTERNAL_ABI u32 crc32c_small_x86_64_sse42(u32, u8 *, sz);
+#elif defined(XPAR_AARCH64)
+  extern int crc32c_aarch64_cpuflags(void);
+  extern u32 crc32c_small_aarch64_neon(u32, u8 *, sz);
 #endif
 
 u32 crc32c(u8 * data, sz length) {
 #if defined(XPAR_X86_64)
   static int cpuflags = -1;
   if (cpuflags == -1) cpuflags = crc32c_x86_64_cpuflags();
   if (cpuflags & 1)
     return crc32c_small_x86_64_sse42(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
   else
     return crc32c_tabular(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
+#elif defined(XPAR_AARCH64)
+  /* Probe once and cache; kept per-branch so the generic build has no unused variable. */
+  static int cpuflags = -1;
+  if (cpuflags == -1) cpuflags = crc32c_aarch64_cpuflags();
+  if (cpuflags)
+    return crc32c_small_aarch64_neon(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
+  else
+    return crc32c_tabular(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
 #else
   return crc32c_tabular(0xFFFFFFFFL, data, length) ^ 0xFFFFFFFFL;
 #endif
diff --git a/xpar-aarch64.S b/xpar-aarch64.S
new file mode 100644
index 0000000..eed513c
--- /dev/null
+++ b/xpar-aarch64.S
@@ -0,0 +1,125 @@
+/*
+   Copyright (C) 2022-2024 Kamila Szewczyk
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+.text
+.extern getauxval
+
+/* int crc32c_aarch64_cpuflags(void)
+   Returns non-zero when the CPU reports CRC32 instruction support. */
+#if defined(__APPLE__)
+/* aarch64 MacOS: read the hw.optional.armv8_crc32 sysctl. */
+.extern _sysctlbyname
+.globl _crc32c_aarch64_cpuflags
+_crc32c_aarch64_cpuflags:
+  sub sp, sp, #32
+  stp x29, x30, [sp, #16]
+  add x29, sp, #16
+  mov w8, #4                      /* byte size of the 32-bit result slot */
+  adrp x0, .name@GOTPAGE
+  ldr x0, [x0, .name@GOTPAGEOFF]  /* x0 = sysctl name */
+  sub x1, x29, #4                 /* x1 = &result */
+  mov x2, sp                      /* x2 = &result_size */
+  mov x3, xzr
+  mov x4, xzr
+  stur wzr, [x29, #-4]            /* result = 0 */
+  str x8, [sp]                    /* result_size = 4 */
+  bl _sysctlbyname
+  ldur w8, [x29, #-4]
+  cmp w0, #0
+  ccmp w8, #0, #4, eq             /* call succeeded and result != 0 ? */
+  cset w0, ne
+  ldp x29, x30, [sp, #16]
+  add sp, sp, #32
+  ret
+.name: .asciz "hw.optional.armv8_crc32"
+#else
+/* aarch64 Linux: test the CRC32 bit of getauxval(AT_HWCAP). */
+.extern getauxval
+.globl crc32c_aarch64_cpuflags
+crc32c_aarch64_cpuflags:
+  stp x29, x30, [sp, -16]!
+  mov x0, 16                      /* AT_HWCAP */
+  mov x29, sp
+  bl getauxval
+  ldp x29, x30, [sp], 16
+  and w0, w0, 128                 /* HWCAP_CRC32 */
+  ret
+#endif
+
+/* u32 crc32c_small_aarch64_neon(u32, u8 *, sz)
+   In: w0 = running CRC, x1 = data pointer, x2 = byte count. Out: w0.
+   Uses fundamentally the same algorithm as crc32c_small_x86_64_sse42.
+   Consumes 64-byte blocks first, then 8-byte words, then single bytes. */
+
+#if defined(__APPLE__)
+.globl _crc32c_small_aarch64_neon
+_crc32c_small_aarch64_neon:
+#else
+.globl crc32c_small_aarch64_neon
+crc32c_small_aarch64_neon:
+#endif
+  cmp x2, 63
+  bls .fallback_1way              /* fewer than 64 bytes: skip the block loop */
+  sub x8, x2, #64
+  mov x3, x1
+  and x7, x8, -64
+  add x7, x7, 64
+  add x7, x1, x7                  /* x7 = end of the last whole 64-byte block */
+.crc32c_8way_quad:
+  ldp x4, x6, [x3]
+  crc32cx w0, w0, x4
+  ldp x4, x5, [x3, 16]
+  crc32cx w0, w0, x6
+  crc32cx w0, w0, x4
+  ldp x4, x6, [x3, 32]
+  crc32cx w0, w0, x5
+  crc32cx w0, w0, x4
+  ldp x5, x4, [x3, 48]
+  crc32cx w0, w0, x6
+  add x3, x3, 64
+  crc32cx w0, w0, x5
+  crc32cx w0, w0, x4
+  cmp x3, x7
+  bne .crc32c_8way_quad
+  and x8, x8, -64
+  add x1, x1, 64
+  add x1, x8, x1                  /* advance data past the blocks consumed */
+  and x2, x2, 63                  /* remaining byte count */
+.fallback_1way:
+  cmp x2, 7
+  bls .fallback_1way_byte         /* fewer than 8 bytes: skip the word loop */
+  sub x6, x2, #8
+  mov x3, x1
+  lsr x5, x6, 3
+  add w5, w5, 1
+  add x5, x1, x5, lsl 3           /* x5 = end of the last whole 8-byte word */
+.crc32c_1way_quad:
+  ldr x4, [x3], 8
+  crc32cx w0, w0, x4
+  cmp x3, x5
+  bne .crc32c_1way_quad
+  and x6, x6, -8
+  add x1, x1, 8
+  add x1, x6, x1                  /* advance data past the words consumed */
+  and x2, x2, 7                   /* remaining byte count */
+.fallback_1way_byte:
+  cbz x2, .crc32c_done
+  add x2, x1, x2                  /* x2 = end pointer for the byte loop */
+.crc32c_1way_byte:
+  ldrb w3, [x1], 1
+  crc32cb w0, w0, w3
+  cmp x1, x2
+  bne .crc32c_1way_byte
+.crc32c_done:
+  ret