Patch summary (free text before the first "diff --git" header):
  * CMake: probe for ARM NEON before x86 intrinsics so ARM64EC is treated as ARM, not x86.
  * CheckCPUArch: also recognise the MSVC macros _M_ARM64 and _M_ARM64EC as arm64.
  * bitwriter: compare the requested capacity, in words, against a precomputed word limit.
  * lpc_intrin_neon: load coefficient vectors with vld1q_s32 / load_int32x4 instead of
    aggregate initializers, which MSVC does not accept for NEON types.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e39aeb5108..d6577127ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -109,7 +109,6 @@ check_include_file("byteswap.h" HAVE_BYTESWAP_H)
 check_include_file("inttypes.h" HAVE_INTTYPES_H)
 check_include_file("stdint.h" HAVE_STDINT_H)
 check_include_file("stdbool.h" HAVE_STDBOOL_H)
-check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
 check_include_file("threads.h" HAVE_THREADS_H)
 
 if(MSVC AND CMAKE_C_STANDARD GREATER_EQUAL 11)
@@ -128,13 +127,16 @@ if(NOT HAVE_STDINT_H OR NOT HAVE_STDBOOL_H)
     message(SEND_ERROR "Header stdint.h and/or stdbool.h not found")
 endif()
 
-if(MSVC)
-    check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
-else()
-    check_include_file("x86intrin.h" FLAC__HAS_X86INTRIN)
+check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
+if(NOT FLAC__HAS_NEONINTRIN)
+    # Check for x86 after checking for ARM - treat ARM64EC as ARM not x86
+    if(MSVC)
+        check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
+    else()
+        check_include_file("x86intrin.h" FLAC__HAS_X86INTRIN)
+    endif()
 endif()
-
 
 if(ANDROID AND CMAKE_SYSTEM_PROCESSOR MATCHES "armv7-a|i686")
     # fseeko/ftello may link, but it's not usable before Android API 24 on 32-bit Android
     # https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md
diff --git a/cmake/CheckCPUArch.cmake b/cmake/CheckCPUArch.cmake
index 665fa61501..3ca8912ffb 100644
--- a/cmake/CheckCPUArch.cmake
+++ b/cmake/CheckCPUArch.cmake
@@ -23,5 +23,5 @@ macro(CHECK_CPU_ARCH_X86 VARIABLE)
 endmacro(CHECK_CPU_ARCH_X86)
 
 macro(CHECK_CPU_ARCH_ARM64 VARIABLE)
-    _CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE})
+    _CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)" ${VARIABLE})
 endmacro(CHECK_CPU_ARCH_ARM64)
diff --git a/src/libFLAC/CMakeLists.txt b/src/libFLAC/CMakeLists.txt
index 4336737f5b..eee142ebfc 100644
--- a/src/libFLAC/CMakeLists.txt
+++ b/src/libFLAC/CMakeLists.txt
@@ -10,9 +10,15 @@
 include(CheckCSourceCompiles)
 include(CheckCPUArch)
 include(CheckA64NEON)
-check_cpu_arch_x64(FLAC__CPU_X86_64)
-if(NOT FLAC__CPU_X86_64)
-    check_cpu_arch_x86(FLAC__CPU_IA32)
+check_cpu_arch_arm64(FLAC__CPU_ARM64)
+if(FLAC__CPU_ARM64)
+    check_a64neon(FLAC__HAS_A64NEONINTRIN)
+else()
+    # Check for x86 after checking for ARM - treat ARM64EC as ARM not x86
+    check_cpu_arch_x64(FLAC__CPU_X86_64)
+    if(NOT FLAC__CPU_X86_64)
+        check_cpu_arch_x86(FLAC__CPU_IA32)
+    endif()
 endif()
 
 if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
@@ -21,13 +27,9 @@ if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
     if(WITH_AVX AND MSVC)
         set_source_files_properties(fixed_intrin_avx2.c lpc_intrin_avx2.c stream_encoder_intrin_avx2.c lpc_intrin_fma.c PROPERTIES COMPILE_FLAGS /arch:AVX2)
     endif()
-else()
-    check_cpu_arch_arm64(FLAC__CPU_ARM64)
-    if(FLAC__CPU_ARM64)
-        check_a64neon(FLAC__HAS_A64NEONINTRIN)
-    endif()
 endif()
 
+
 if(NOT WITH_ASM)
     add_definitions(-DFLAC__NO_ASM)
 endif()
diff --git a/src/libFLAC/bitwriter.c b/src/libFLAC/bitwriter.c
index a6e2e115e7..c8b1cb05d8 100644
--- a/src/libFLAC/bitwriter.c
+++ b/src/libFLAC/bitwriter.c
@@ -112,6 +112,9 @@ FLAC__bool bitwriter_grow_(FLAC__BitWriter *bw, uint32_t bits_to_add)
 	uint32_t new_capacity;
 	bwword *new_buffer;
 
+	/* Refuse to allocate more words than this amount, based on largest possible metadata chunk size */
+	const uint32_t max_capacity = (((1u << FLAC__STREAM_METADATA_LENGTH_LEN) - 1) * 8 + FLAC__STREAM_METADATA_LENGTH_LEN + FLAC__BITS_PER_WORD - 1) / FLAC__BITS_PER_WORD;
+
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
 
@@ -124,7 +127,7 @@ FLAC__bool bitwriter_grow_(FLAC__BitWriter *bw, uint32_t bits_to_add)
 	if(bw->capacity >= new_capacity)
 		return true;
 
-	if(new_capacity * sizeof(bwword) > (1u << FLAC__STREAM_METADATA_LENGTH_LEN))
+	if(new_capacity > max_capacity)
 		/* Requested new capacity is larger than the largest possible metadata block,
 		 * which is also larger than the largest sane framesize. That means something
 		 * went very wrong somewhere and previous checks failed.
diff --git a/src/libFLAC/lpc_intrin_neon.c b/src/libFLAC/lpc_intrin_neon.c
index b722046123..41f4971fbb 100644
--- a/src/libFLAC/lpc_intrin_neon.c
+++ b/src/libFLAC/lpc_intrin_neon.c
@@ -65,6 +65,14 @@ void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[]
 
 #endif /* ifdef FLAC__HAS_A64NEONINTRIN */
 
+static inline int32x4_t load_int32x4(int32_t a, int32_t b, int32_t c, int32_t d) {
+#ifdef _MSC_VER /* MSVC does not support aggregate initializer of Neon types */
+    int32_t temp[] = {a,b,c,d};
+    return vld1q_s32(temp);
+#else
+    return (int32x4_t){a,b,c,d}; /* compound literal: a bare braced list is not a valid C return expression */
+#endif
+}
 
 #define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
                         summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
@@ -91,9 +99,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
         if(order > 8) {
             if(order > 10) {
                 if (order == 12) {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], qlp_coeff[11]};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = vld1q_s32(qlp_coeff + 8);
 
                     tmp_vec[0] = vld1q_s32(data - 12);
                     tmp_vec[1] = vld1q_s32(data - 11);
@@ -150,9 +158,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
 
                 }
                 else { /* order == 11 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0);
 
                     tmp_vec[0] = vld1q_s32(data - 11);
                     tmp_vec[1] = vld1q_s32(data - 10);
@@ -208,9 +216,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
             }
             else {
                 if(order == 10) {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 10);
                     tmp_vec[1] = vld1q_s32(data - 9);
@@ -261,9 +269,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
                     }
                 }
                 else { /* order == 9 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], 0, 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 9);
                     tmp_vec[1] = vld1q_s32(data - 8);
@@ -313,8 +321,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
         else if(order > 4) {
             if(order > 6) {
                 if(order == 8) {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
 
                     tmp_vec[0] = vld1q_s32(data - 8);
                     tmp_vec[1] = vld1q_s32(data - 7);
@@ -357,8 +365,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
                     }
                 }
                 else { /* order == 7 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0);
 
                     tmp_vec[0] = vld1q_s32(data - 7);
                     tmp_vec[1] = vld1q_s32(data - 6);
@@ -400,8 +408,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
             }
             else {
                 if(order == 6) {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 6);
                     tmp_vec[1] = vld1q_s32(data - 5);
@@ -438,8 +446,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
                     }
                 }
                 else { /* order == 5 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], 0, 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 5);
 
@@ -478,7 +486,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
         else {
             if(order > 2) {
                 if(order == 4) {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
 
                     for (i = 0; i < (int)data_len - 11; i += 12)
                     {
@@ -507,7 +515,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
                     }
                 }
                 else { /* order == 3 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
+                    int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0);
 
                     for (i = 0; i < (int)data_len - 11; i += 12)
                     {
@@ -534,7 +542,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
             }
             else {
                 if(order == 2) {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
+                    int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], 0, 0);
 
                     for (i = 0; i < (int)data_len - 11; i += 12)
                     {
@@ -679,9 +687,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
         if(order > 8) {
             if(order > 10) {
                 if(order == 12) {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],qlp_coeff[11]};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = vld1q_s32(qlp_coeff + 8);
 
                     tmp_vec[0] = vld1q_s32(data - 12);
                     tmp_vec[1] = vld1q_s32(data - 11);
@@ -735,9 +743,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
                     }
                 }
                 else { /* order == 11 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0);
 
                     tmp_vec[0] = vld1q_s32(data - 11);
                     tmp_vec[1] = vld1q_s32(data - 10);
@@ -791,9 +799,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
             else {
                 if (order == 10)
                 {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 10);
                     tmp_vec[1] = vld1q_s32(data - 9);
@@ -843,9 +851,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
                 }
                 else /* order == 9 */
                 {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
-                    int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
+                    int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], 0, 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 9);
                     tmp_vec[1] = vld1q_s32(data - 8);
@@ -897,8 +905,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
             {
                 if (order == 8)
                 {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
 
                     tmp_vec[0] = vld1q_s32(data - 8);
                     tmp_vec[1] = vld1q_s32(data - 7);
@@ -942,8 +950,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
                 }
                 else /* order == 7 */
                 {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0);
 
                     tmp_vec[0] = vld1q_s32(data - 7);
                     tmp_vec[1] = vld1q_s32(data - 6);
@@ -986,8 +994,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
             else {
                 if (order == 6)
                 {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 6);
                     tmp_vec[1] = vld1q_s32(data - 5);
@@ -1026,8 +1034,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
                 else
                 {
                     /* order == 5 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
-                    int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
+                    int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], 0, 0, 0);
 
                     tmp_vec[0] = vld1q_s32(data - 5);
 
@@ -1066,7 +1074,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
             {
                 if (order == 4)
                 {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
+                    int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
 
                     for (i = 0; i < (int)data_len - 11; i += 12)
                     {
@@ -1095,7 +1103,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
                 else
                 {
                     /* order == 3 */
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
+                    int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0);
 
                     for (i = 0; i < (int)data_len - 11; i += 12)
                     {
@@ -1122,7 +1130,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
             {
                 if (order == 2)
                 {
-                    int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
+                    int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], 0, 0);
 
                     for (i = 0; i < (int)data_len - 11; i += 12)
                     {