Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ check_include_file("byteswap.h" HAVE_BYTESWAP_H)
check_include_file("inttypes.h" HAVE_INTTYPES_H)
check_include_file("stdint.h" HAVE_STDINT_H)
check_include_file("stdbool.h" HAVE_STDBOOL_H)
check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)

check_include_file("threads.h" HAVE_THREADS_H)
if(MSVC AND CMAKE_C_STANDARD GREATER_EQUAL 11)
Expand All @@ -128,13 +127,16 @@ if(NOT HAVE_STDINT_H OR NOT HAVE_STDBOOL_H)
message(SEND_ERROR "Header stdint.h and/or stdbool.h not found")
endif()

if(MSVC)
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
else()
check_include_file("x86intrin.h" FLAC__HAS_X86INTRIN)
check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
if(!FLAC__HAS_NEONINTRIN)
# Check for x86 after checking for ARM - treat ARM64EC as ARM not x86
if(MSVC)
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
else()
check_include_file("x86intrin.h" FLAC__HAS_X86INTRIN)
endif()
endif()


if(ANDROID AND CMAKE_SYSTEM_PROCESSOR MATCHES "armv7-a|i686")
# fseeko/ftello may link, but it's not usable before Android API 24 on 32-bit Android
# https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md
Expand Down
2 changes: 1 addition & 1 deletion cmake/CheckCPUArch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ macro(CHECK_CPU_ARCH_X86 VARIABLE)
endmacro(CHECK_CPU_ARCH_X86)

macro(CHECK_CPU_ARCH_ARM64 VARIABLE)
_CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE})
_CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)" ${VARIABLE})
endmacro(CHECK_CPU_ARCH_ARM64)
18 changes: 10 additions & 8 deletions src/libFLAC/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@ include(CheckCSourceCompiles)
include(CheckCPUArch)
include(CheckA64NEON)

check_cpu_arch_x64(FLAC__CPU_X86_64)
if(NOT FLAC__CPU_X86_64)
check_cpu_arch_x86(FLAC__CPU_IA32)
check_cpu_arch_arm64(FLAC__CPU_ARM64)
if(FLAC__CPU_ARM64)
check_a64neon(FLAC__HAS_A64NEONINTRIN)
else()
# Check for x86 after checking for ARM - treat ARM64EC as ARM not x86
check_cpu_arch_x64(FLAC__CPU_X86_64)
if(NOT FLAC__CPU_X86_64)
check_cpu_arch_x86(FLAC__CPU_IA32)
endif()
endif()

if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
Expand All @@ -21,13 +27,9 @@ if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
if(WITH_AVX AND MSVC)
set_source_files_properties(fixed_intrin_avx2.c lpc_intrin_avx2.c stream_encoder_intrin_avx2.c lpc_intrin_fma.c PROPERTIES COMPILE_FLAGS /arch:AVX2)
endif()
else()
check_cpu_arch_arm64(FLAC__CPU_ARM64)
if(FLAC__CPU_ARM64)
check_a64neon(FLAC__HAS_A64NEONINTRIN)
endif()
endif()


if(NOT WITH_ASM)
add_definitions(-DFLAC__NO_ASM)
endif()
Expand Down
5 changes: 4 additions & 1 deletion src/libFLAC/bitwriter.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ FLAC__bool bitwriter_grow_(FLAC__BitWriter *bw, uint32_t bits_to_add)
uint32_t new_capacity;
bwword *new_buffer;

/* Refuse to allocate more words than this amount, based on largest possible metadata chunk size */
const uint32_t max_capacity = (((1u << FLAC__STREAM_METADATA_LENGTH_LEN) - 1) * 8 + FLAC__STREAM_METADATA_LENGTH_LEN + FLAC__BITS_PER_WORD - 1) / FLAC__BITS_PER_WORD;

FLAC__ASSERT(0 != bw);
FLAC__ASSERT(0 != bw->buffer);

Expand All @@ -124,7 +127,7 @@ FLAC__bool bitwriter_grow_(FLAC__BitWriter *bw, uint32_t bits_to_add)
if(bw->capacity >= new_capacity)
return true;

if(new_capacity * sizeof(bwword) > (1u << FLAC__STREAM_METADATA_LENGTH_LEN))
if(new_capacity > max_capacity)
/* Requested new capacity is larger than the largest possible metadata block,
* which is also larger than the largest sane framesize. That means something
* went very wrong somewhere and previous checks failed.
Expand Down
100 changes: 54 additions & 46 deletions src/libFLAC/lpc_intrin_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[]

#endif /* ifdef FLAC__HAS_A64NEONINTRIN */

static inline int32x4_t load_int32x4(int32_t a, int32_t b, int32_t c, int32_t d) {
#ifdef _MSC_VER // MSVC does not support aggregate initializer of Neon types
int32_t temp[] = {a,b,c,d};
return vld1q_s32(temp);
#else
return {a,b,c,d};
#endif
}

#define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
Expand All @@ -91,9 +99,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
if(order > 8) {
if(order > 10) {
if (order == 12) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], qlp_coeff[11]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = vld1q_s32(qlp_coeff + 8);

tmp_vec[0] = vld1q_s32(data - 12);
tmp_vec[1] = vld1q_s32(data - 11);
Expand Down Expand Up @@ -150,9 +158,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}

else { /* order == 11 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0);

tmp_vec[0] = vld1q_s32(data - 11);
tmp_vec[1] = vld1q_s32(data - 10);
Expand Down Expand Up @@ -208,9 +216,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
else {
if(order == 10) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], 0, 0);

tmp_vec[0] = vld1q_s32(data - 10);
tmp_vec[1] = vld1q_s32(data - 9);
Expand Down Expand Up @@ -261,9 +269,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 9 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 9);
tmp_vec[1] = vld1q_s32(data - 8);
Expand Down Expand Up @@ -313,8 +321,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
else if(order > 4) {
if(order > 6) {
if(order == 8) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);

tmp_vec[0] = vld1q_s32(data - 8);
tmp_vec[1] = vld1q_s32(data - 7);
Expand Down Expand Up @@ -357,8 +365,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 7 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0);

tmp_vec[0] = vld1q_s32(data - 7);
tmp_vec[1] = vld1q_s32(data - 6);
Expand Down Expand Up @@ -400,8 +408,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
else {
if(order == 6) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], 0, 0);

tmp_vec[0] = vld1q_s32(data - 6);
tmp_vec[1] = vld1q_s32(data - 5);
Expand Down Expand Up @@ -438,8 +446,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 5 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 5);

Expand Down Expand Up @@ -478,7 +486,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
else {
if(order > 2) {
if(order == 4) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down Expand Up @@ -507,7 +515,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 3 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand All @@ -534,7 +542,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
else {
if(order == 2) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], 0, 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down Expand Up @@ -679,9 +687,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
if(order > 8) {
if(order > 10) {
if(order == 12) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],qlp_coeff[11]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = vld1q_s32(qlp_coeff + 8);

tmp_vec[0] = vld1q_s32(data - 12);
tmp_vec[1] = vld1q_s32(data - 11);
Expand Down Expand Up @@ -735,9 +743,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
}
}
else { /* order == 11 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0);

tmp_vec[0] = vld1q_s32(data - 11);
tmp_vec[1] = vld1q_s32(data - 10);
Expand Down Expand Up @@ -791,9 +799,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
else
{
if (order == 10) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], 0, 0);

tmp_vec[0] = vld1q_s32(data - 10);
tmp_vec[1] = vld1q_s32(data - 9);
Expand Down Expand Up @@ -843,9 +851,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
}

else /* order == 9 */ {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 9);
tmp_vec[1] = vld1q_s32(data - 8);
Expand Down Expand Up @@ -897,8 +905,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
{
if (order == 8)
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);

tmp_vec[0] = vld1q_s32(data - 8);
tmp_vec[1] = vld1q_s32(data - 7);
Expand Down Expand Up @@ -942,8 +950,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
}
else /* order == 7 */
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0);

tmp_vec[0] = vld1q_s32(data - 7);
tmp_vec[1] = vld1q_s32(data - 6);
Expand Down Expand Up @@ -986,8 +994,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
else
{
if (order == 6) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], 0, 0);

tmp_vec[0] = vld1q_s32(data - 6);
tmp_vec[1] = vld1q_s32(data - 5);
Expand Down Expand Up @@ -1026,8 +1034,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA

else
{ /* order == 5 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 5);

Expand Down Expand Up @@ -1066,7 +1074,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
{
if (order == 4)
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down Expand Up @@ -1095,7 +1103,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
else
{ /* order == 3 */

int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand All @@ -1122,7 +1130,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
{
if (order == 2)
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], 0, 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down