diff --git a/CMakeLists.txt b/CMakeLists.txt index fb23b7d963..ee6be886ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,13 @@ check_include_file("stdint.h" HAVE_STDINT_H) check_include_file("stdbool.h" HAVE_STDBOOL_H) check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN) +# Toolchains won't allow riscv_vector.h to be included unless the +# vector extensions are enabled. +set(SAVED_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") +set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") +check_include_file("riscv_vector.h" HAVE_RISCV_VECTOR_H) +set(CMAKE_REQUIRED_FLAGS "${SAVED_CMAKE_REQUIRED_FLAGS}") + if(NOT HAVE_STDINT_H OR NOT HAVE_STDBOOL_H) message(SEND_ERROR "Header stdint.h and/or stdbool.h not found") endif() diff --git a/Makefile.am b/Makefile.am index 3bb523e880..78c75518b3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -43,6 +43,8 @@ EXTRA_DIST = \ flac-config.cmake.in \ cmake/CheckA64NEON.c.in \ cmake/CheckA64NEON.cmake \ + cmake/CheckRV64Vector.c.in \ + cmake/CheckRV64Vector.cmake \ cmake/CheckCPUArch.c.in \ cmake/CheckCPUArch.cmake \ cmake/FindOgg.cmake \ diff --git a/cmake/CheckCPUArch.cmake b/cmake/CheckCPUArch.cmake index 665fa61501..4e41632481 100644 --- a/cmake/CheckCPUArch.cmake +++ b/cmake/CheckCPUArch.cmake @@ -25,3 +25,7 @@ endmacro(CHECK_CPU_ARCH_X86) macro(CHECK_CPU_ARCH_ARM64 VARIABLE) _CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE}) endmacro(CHECK_CPU_ARCH_ARM64) + +macro(CHECK_CPU_ARCH_RISCV64 VARIABLE) + _CHECK_CPU_ARCH(riscv64 "defined(__riscv)" ${VARIABLE}) +endmacro(CHECK_CPU_ARCH_RISCV64) diff --git a/cmake/CheckRV64Vector.c.in b/cmake/CheckRV64Vector.c.in new file mode 100644 index 0000000000..fc9662305d --- /dev/null +++ b/cmake/CheckRV64Vector.c.in @@ -0,0 +1,7 @@ +#include +int main (void) +{ + size_t vl = __riscv_vsetvl_e64m2(8); + vfloat64m2_t a = __riscv_vfmv_v_f_f64m2(0.5, vl); + return 0; +} diff --git a/cmake/CheckRV64Vector.cmake b/cmake/CheckRV64Vector.cmake new file mode 100644 index 
0000000000..c73f65e8f0 --- /dev/null +++ b/cmake/CheckRV64Vector.cmake @@ -0,0 +1,15 @@ +macro(CHECK_RV64VECTOR VARIABLE) + if(NOT DEFINED HAVE_${VARIABLE}) + message(STATUS "Check whether RV64 Vector can be used") + configure_file(${PROJECT_SOURCE_DIR}/cmake/CheckRV64Vector.c.in ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckRV64Vector.c @ONLY) + try_compile(HAVE_${VARIABLE} "${PROJECT_BINARY_DIR}" + "${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckRV64Vector.c" COMPILE_DEFINITIONS "-march=rv64gcv") + if(HAVE_${VARIABLE}) + message(STATUS "Check whether RV64 Vector can be used - yes") + set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_RV64VECTOR" FORCE) + else () + message(STATUS "Check whether RV64 Vector can be used - no") + endif() + endif () +endmacro(CHECK_RV64VECTOR) + diff --git a/config.cmake.h.in b/config.cmake.h.in index acc73f0849..53fa41d0c9 100644 --- a/config.cmake.h.in +++ b/config.cmake.h.in @@ -9,6 +9,9 @@ /* Target processor ARM64 */ #cmakedefine FLAC__CPU_ARM64 +/* Target processor RiscV64 */ +#cmakedefine FLAC__CPU_RISCV64 + /* Set FLAC__BYTES_PER_WORD to 8 (4 is the default) */ #cmakedefine01 ENABLE_64_BIT_WORDS @@ -31,6 +34,9 @@ /* Set to 1 if contains A64 intrinsics */ #cmakedefine01 FLAC__HAS_A64NEONINTRIN +/* Set to 1 if is available. */ +#cmakedefine01 FLAC__HAS_RISCVINTRIN + /* define if building for Darwin / MacOS X */ #cmakedefine FLAC__SYS_DARWIN @@ -103,6 +109,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_PARAM_H +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYS_AUXV_H + /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_STAT_H @@ -112,6 +121,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_TERMIOS_H +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_RISCV_VECTOR_H + /* Define to 1 if typeof works with your compiler. 
*/ #cmakedefine HAVE_TYPEOF diff --git a/configure.ac b/configure.ac index 418677c7a6..06d9dafb68 100644 --- a/configure.ac +++ b/configure.ac @@ -86,6 +86,15 @@ AC_DEFINE(FLAC__NO_ASM) AH_TEMPLATE(FLAC__NO_ASM, [define to disable use of assembly code]) fi +AC_ARG_ENABLE(riscv-vector-optimizations, AS_HELP_STRING([--enable-riscv-vector-optimizations],[Enable RiscV Vector Optimization Routines]), riscv_vector_opt=yes, riscv_vector_opt=no) +AM_CONDITIONAL(FLAC__RISCV_VECTOR, test "x$riscv_vector_opt" = xyes) +if test "x$riscv_vector_opt" = xyes ; then +CFLAGS="-march=rv64gcv $CFLAGS" +AC_CHECK_HEADERS([riscv_vector.h]) +AC_DEFINE(FLAC__RISCV_VECTOR) +AH_TEMPLATE(FLAC__RISCV_VECTOR, [define to enable use riscv vector extensions]) +fi + dnl check for getauxval in standard library AC_CHECK_FUNCS(getauxval) @@ -131,10 +140,16 @@ case "$host_cpu" in AC_DEFINE(FLAC__CPU_ARM64) AH_TEMPLATE(FLAC__CPU_ARM64, [define if building for ARM]) ;; + riscv64) + cpu_riscv64=true + AC_DEFINE(FLAC__CPU_RISCV64) + AH_TEMPLATE(FLAC__CPU_RISCV64, [define if building for Riscv64]) + ;; esac AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue) AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue) AM_CONDITIONAL(FLAC__CPU_ARM64, test "x$cpu_arm64" = xtrue) +AM_CONDITIONAL(FLAC__CPU_RISCV64, test "x$cpu_riscv64" = xtrue) if test "x$ac_cv_header_x86intrin_h" = xyes -a "x$asm_opt" = xyes; then AC_DEFINE([FLAC__HAS_X86INTRIN], 1, [Set to 1 if is available.]) @@ -162,6 +177,25 @@ else AC_DEFINE([FLAC__HAS_NEONINTRIN], 0) fi +if test "x$ac_cv_header_riscv_vector_h" = xyes -a "x$asm_opt" = xyes; then + AC_DEFINE([FLAC__HAS_RISCVINTRIN], 1, [Set to 1 if is available.]) + AC_MSG_CHECKING([whether riscv_vector.h has Vector functions]) + AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM([[#include ]], + [[size_t vl = __riscv_vsetvl_e64m2(8); vfloat64m2_t a = __riscv_vfmv_v_f_f64m2(0.5, vl);]])], + [AC_MSG_RESULT([yes]) + has_riscvvector=yes], + [AC_MSG_RESULT([no])]) + if test "x$has_riscvvector" = 
xyes; then + AC_DEFINE([FLAC__HAS_RISCVINTRIN], 1, [Set to 1 if has vector instructions.]) + asm_optimisation=yes + else + AC_DEFINE([FLAC__HAS_RISCVINTRIN], 0) + fi +else + AC_DEFINE([FLAC__HAS_RISCVINTRIN], 0) +fi + case "$host" in i386-*-openbsd3.[[0-3]]) OBJ_FORMAT=aoutb ;; *-*-cygwin|*mingw*) OBJ_FORMAT=win32 ;; diff --git a/src/libFLAC/CMakeLists.txt b/src/libFLAC/CMakeLists.txt index cf7368f60c..2e5fe52cdc 100644 --- a/src/libFLAC/CMakeLists.txt +++ b/src/libFLAC/CMakeLists.txt @@ -1,7 +1,9 @@ option(WITH_ASM "Use any assembly optimization routines" ON) +option(RISCV_VECTOR "Use RiscV Vector Optimization" OFF) check_include_file("cpuid.h" HAVE_CPUID_H) check_include_file("sys/param.h" HAVE_SYS_PARAM_H) +check_include_file("sys/auxv.h" HAVE_SYS_AUXV_H) set(CMAKE_REQUIRED_LIBRARIES m) check_function_exists(lround HAVE_LROUND) @@ -9,6 +11,7 @@ check_function_exists(lround HAVE_LROUND) include(CheckCSourceCompiles) include(CheckCPUArch) include(CheckA64NEON) +include(CheckRV64Vector) check_cpu_arch_x64(FLAC__CPU_X86_64) if(NOT FLAC__CPU_X86_64) @@ -26,12 +29,24 @@ else() if(FLAC__CPU_ARM64) check_a64neon(FLAC__HAS_A64NEONINTRIN) endif() + + check_cpu_arch_riscv64(FLAC__CPU_RISCV64) + if(FLAC__CPU_RISCV64) + check_rv64vector(FLAC__HAS_RISCVINTRIN) + if (RISCV_VECTOR AND FLAC__HAS_RISCVINTRIN) + set_property(SOURCE lpc_intrin_riscv.c cpu.c APPEND_STRING PROPERTY COMPILE_FLAGS " -march=rv64gcv ") + endif() + endif() endif() if(NOT WITH_ASM) add_definitions(-DFLAC__NO_ASM) endif() +if(RISCV_VECTOR) + add_definitions(-DFLAC__RISCV_VECTOR) +endif() + include_directories("include") add_library(FLAC @@ -53,6 +68,7 @@ add_library(FLAC lpc_intrin_sse41.c lpc_intrin_avx2.c lpc_intrin_fma.c + lpc_intrin_riscv.c md5.c memory.c metadata_iterators.c diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am index 618939dfc3..a4433cc23a 100644 --- a/src/libFLAC/Makefile.am +++ b/src/libFLAC/Makefile.am @@ -100,6 +100,7 @@ libFLAC_sources = \ lpc_intrin_avx2.c \ 
lpc_intrin_fma.c \ lpc_intrin_neon.c \ + lpc_intrin_riscv.c \ md5.c \ memory.c \ metadata_iterators.c \ diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c index d088e3c0b7..b381a55867 100644 --- a/src/libFLAC/cpu.c +++ b/src/libFLAC/cpu.c @@ -57,6 +57,10 @@ #include #endif +#if defined(HAVE_RISCV_VECTOR_H) && defined(FLAC__RISCV_VECTOR) && defined(FLAC__HAS_RISCVINTRIN) +#include +#endif + #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN && !defined FLAC__NO_ASM /* these are flags in EDX of CPUID AX=00000001 */ @@ -231,6 +235,26 @@ x86_cpu_info (FLAC__CPUInfo *info) #endif } +static void +rv64_cpu_info(FLAC__CPUInfo *info) +{ +#if defined(FLAC__CPU_RISCV64) && defined(FLAC__HAS_RISCVINTRIN) && !defined(FLAC__NO_ASM) && defined(HAVE_SYS_AUXV_H) && defined(FLAC__RISCV_VECTOR) && defined(HAVE_RISCV_VECTOR_H) +#define ISA_V_HWCAP (1 << ('v' - 'a')) + // Check that the kernel and the hardware support RiscV Vector. + unsigned long hw_cap = getauxval(AT_HWCAP); + info->rv64.has_vector = (hw_cap & ISA_V_HWCAP) == ISA_V_HWCAP; + if(info->rv64.has_vector) { + info->rv64.vlenb = __riscv_vsetvlmax_e8m1(); + } + else { + info->rv64.vlenb = 0; + } +#else + info->rv64.has_vector = false; + info->rv64.vlenb = 0; +#endif +} + void FLAC__cpu_info (FLAC__CPUInfo *info) { memset(info, 0, sizeof(*info)); @@ -239,6 +263,8 @@ void FLAC__cpu_info (FLAC__CPUInfo *info) info->type = FLAC__CPUINFO_TYPE_IA32; #elif defined FLAC__CPU_X86_64 info->type = FLAC__CPUINFO_TYPE_X86_64; +#elif defined FLAC__CPU_RISCV64 + info->type = FLAC__CPUINFO_TYPE_RISCV_64; #else info->type = FLAC__CPUINFO_TYPE_UNKNOWN; #endif @@ -248,6 +274,9 @@ void FLAC__cpu_info (FLAC__CPUInfo *info) case FLAC__CPUINFO_TYPE_X86_64: x86_cpu_info (info); break; + case FLAC__CPUINFO_TYPE_RISCV_64: + rv64_cpu_info(info); + break; default: info->use_asm = false; break; diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h index 8843c74bfe..c961777f85 100644 --- 
a/src/libFLAC/include/private/cpu.h +++ b/src/libFLAC/include/private/cpu.h @@ -162,6 +162,7 @@ typedef enum { FLAC__CPUINFO_TYPE_IA32, FLAC__CPUINFO_TYPE_X86_64, + FLAC__CPUINFO_TYPE_RISCV_64, FLAC__CPUINFO_TYPE_UNKNOWN } FLAC__CPUInfo_Type; @@ -183,10 +184,16 @@ typedef struct { FLAC__bool bmi2; } FLAC__CPUInfo_x86; +typedef struct { + FLAC__bool has_vector; + FLAC__uint32 vlenb; // Vector register length in bytes if CPU supports it. +} FLAC__CPUInfo_RV64; + typedef struct { FLAC__bool use_asm; FLAC__CPUInfo_Type type; FLAC__CPUInfo_x86 x86; + FLAC__CPUInfo_RV64 rv64; } FLAC__CPUInfo; void FLAC__cpu_info(FLAC__CPUInfo *info); diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 766f0560ea..cd934e579f 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -92,6 +92,9 @@ void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[] void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); #endif +#if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN +void FLAC__lpc_compute_autocorrelation_intrin_riscv(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +#endif #endif /* FLAC__NO_ASM */ /* @@ -162,6 +165,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]); # endif +#ifdef FLAC__CPU_RISCV64 +void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]); +#endif + # if 
(defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN # ifdef FLAC__SSE2_SUPPORTED void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]); diff --git a/src/libFLAC/lpc_intrin_riscv.c b/src/libFLAC/lpc_intrin_riscv.c new file mode 100644 index 0000000000..bd180cadb3 --- /dev/null +++ b/src/libFLAC/lpc_intrin_riscv.c @@ -0,0 +1,634 @@ +/* libFLAC - Free Lossless Audio Codec library + * Copyright (C) 2000-2009 Josh Coalson + * Copyright (C) 2011-2023 Xiph.Org Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of the Xiph.org Foundation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "private/cpu.h" + +#ifndef FLAC__INTEGER_ONLY_LIBRARY +#ifndef FLAC__NO_ASM +#ifdef FLAC__RISCV_VECTOR +#ifdef HAVE_RISCV_VECTOR_H +#if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN +#include "private/lpc.h" +#include "FLAC/assert.h" +#include "FLAC/format.h" +#include "private/macros.h" +#include + +void FLAC__lpc_compute_autocorrelation_intrin_riscv(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +{ + uint32_t i; + vfloat64m8_t sample; + + // Set LMUL=8 to group vector registers into groups of 8. Assuming a VLEN of at + // least 128b that should yield enough space for 128x8=1024b / vector register. + // 1024b is enough for 16 Float64s so we should be able to process the entire lag + // window with a single virtual register. 
+ size_t vl = __riscv_vsetvl_e64m8(lag); + + vfloat64m8_t sums = __riscv_vfmv_v_f_f64m8(0.0, vl); + vfloat64m8_t d = __riscv_vfmv_v_f_f64m8(0.0, vl); + + FLAC__ASSERT(vl == lag); + + for(i = 0; i < data_len; i++) { + const double new_sample = data[i]; + sample = __riscv_vfmv_v_f_f64m8(new_sample, vl); + d = __riscv_vfslide1up_vf_f64m8(d, new_sample, vl); + sums = __riscv_vfmacc_vv_f64m8(sums, sample, d, vl); + } + __riscv_vse64_v_f64m8(autoc, sums, vl); +} + +void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]) +{ + int i; + FLAC__int32 sum; + vint32m1_t data_vec; + vint32m1_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + vint32m1_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + vint32m1_t summ, mull; + size_t vl = __riscv_vsetvl_e32m1(4); + + // TODO(gkalsi): Fix this here? + FLAC__ASSERT(vl == 4); + + FLAC__ASSERT(order > 0); + FLAC__ASSERT(order <= 32); + + if(order <= 12) { + if(order > 8) { + if(order > 10) { + if(order == 12) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + q9 = __riscv_vmv_v_x_i32m1(qlp_coeff[9], vl); + q10 = __riscv_vmv_v_x_i32m1(qlp_coeff[10], vl); + q11 = __riscv_vmv_v_x_i32m1(qlp_coeff[11], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = 
__riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + d9 = __riscv_vle32_v_i32m1(data + i - 10, vl); + d10 = __riscv_vle32_v_i32m1(data + i - 11, vl); + d11 = __riscv_vle32_v_i32m1(data + i - 12, vl); + + summ = __riscv_vmul_vv_i32m1(q11, d11, vl); + mull = __riscv_vmul_vv_i32m1(q10, d10, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q9, d9, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q8, d8, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 11 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = 
__riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + q9 = __riscv_vmv_v_x_i32m1(qlp_coeff[9], vl); + q10 = __riscv_vmv_v_x_i32m1(qlp_coeff[10], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + d9 = __riscv_vle32_v_i32m1(data + i - 10, vl); + d10 = __riscv_vle32_v_i32m1(data + i - 11, vl); + + summ = __riscv_vmul_vv_i32m1(q10, d10, vl); + mull = __riscv_vmul_vv_i32m1(q9, d9, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q8, d8, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + else { + if(order == 10) { + q0 
= __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + q9 = __riscv_vmv_v_x_i32m1(qlp_coeff[9], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + d9 = __riscv_vle32_v_i32m1(data + i - 10, vl); + + summ = __riscv_vmul_vv_i32m1(q9, d9, vl); + mull = __riscv_vmul_vv_i32m1(q8, d8, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec 
= __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 9 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + + summ = __riscv_vmul_vv_i32m1(q8, d8, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + 
__riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + } + else if(order > 4) { + if(order > 6) { + if(order == 8) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + + summ = __riscv_vmul_vv_i32m1(q7, d7, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 7 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = 
__riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + summ = __riscv_vmul_vv_i32m1(q6, d6, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + else { + if(order == 6) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, 
vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + + summ = __riscv_vmul_vv_i32m1(q5, d5, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 5 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + + summ = __riscv_vmul_vv_i32m1(q4, d4, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + 
__riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + } + else { + if(order > 2) { + if(order == 4) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + + summ = __riscv_vmul_vv_i32m1(q3, d3, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 3 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + + summ = __riscv_vmul_vv_i32m1(q2, d2, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + else { + if(order == 2) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 
= __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + + summ = __riscv_vmul_vv_i32m1(q1, d1, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 1 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + + summ = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + } + for(; i < (int)data_len; i++) { + sum = 0; + switch(order) { + case 12: + sum += qlp_coeff[11] * data[i - 12]; /* Falls through. */ + case 11: + sum += qlp_coeff[10] * data[i - 11]; /* Falls through. */ + case 10: + sum += qlp_coeff[9] * data[i - 10]; /* Falls through. */ + case 9: + sum += qlp_coeff[8] * data[i - 9]; /* Falls through. */ + case 8: + sum += qlp_coeff[7] * data[i - 8]; /* Falls through. */ + case 7: + sum += qlp_coeff[6] * data[i - 7]; /* Falls through. */ + case 6: + sum += qlp_coeff[5] * data[i - 6]; /* Falls through. */ + case 5: + sum += qlp_coeff[4] * data[i - 5]; /* Falls through. */ + case 4: + sum += qlp_coeff[3] * data[i - 4]; /* Falls through. */ + case 3: + sum += qlp_coeff[2] * data[i - 3]; /* Falls through. */ + case 2: + sum += qlp_coeff[1] * data[i - 2]; /* Falls through. 
*/ + case 1: + sum += qlp_coeff[0] * data[i - 1]; + } + residual[i] = data[i] - (sum >> lp_quantization); + } + } + else { /* order > 12 */ + for(i = 0; i < (int)data_len; i++) { + sum = 0; + switch(order) { + case 32: + sum += qlp_coeff[31] * data[i - 32]; /* Falls through. */ + case 31: + sum += qlp_coeff[30] * data[i - 31]; /* Falls through. */ + case 30: + sum += qlp_coeff[29] * data[i - 30]; /* Falls through. */ + case 29: + sum += qlp_coeff[28] * data[i - 29]; /* Falls through. */ + case 28: + sum += qlp_coeff[27] * data[i - 28]; /* Falls through. */ + case 27: + sum += qlp_coeff[26] * data[i - 27]; /* Falls through. */ + case 26: + sum += qlp_coeff[25] * data[i - 26]; /* Falls through. */ + case 25: + sum += qlp_coeff[24] * data[i - 25]; /* Falls through. */ + case 24: + sum += qlp_coeff[23] * data[i - 24]; /* Falls through. */ + case 23: + sum += qlp_coeff[22] * data[i - 23]; /* Falls through. */ + case 22: + sum += qlp_coeff[21] * data[i - 22]; /* Falls through. */ + case 21: + sum += qlp_coeff[20] * data[i - 21]; /* Falls through. */ + case 20: + sum += qlp_coeff[19] * data[i - 20]; /* Falls through. */ + case 19: + sum += qlp_coeff[18] * data[i - 19]; /* Falls through. */ + case 18: + sum += qlp_coeff[17] * data[i - 18]; /* Falls through. */ + case 17: + sum += qlp_coeff[16] * data[i - 17]; /* Falls through. */ + case 16: + sum += qlp_coeff[15] * data[i - 16]; /* Falls through. */ + case 15: + sum += qlp_coeff[14] * data[i - 15]; /* Falls through. */ + case 14: + sum += qlp_coeff[13] * data[i - 14]; /* Falls through. 
*/ + case 13: + sum += qlp_coeff[12] * data[i - 13]; + sum += qlp_coeff[11] * data[i - 12]; + sum += qlp_coeff[10] * data[i - 11]; + sum += qlp_coeff[9] * data[i - 10]; + sum += qlp_coeff[8] * data[i - 9]; + sum += qlp_coeff[7] * data[i - 8]; + sum += qlp_coeff[6] * data[i - 7]; + sum += qlp_coeff[5] * data[i - 6]; + sum += qlp_coeff[4] * data[i - 5]; + sum += qlp_coeff[3] * data[i - 4]; + sum += qlp_coeff[2] * data[i - 3]; + sum += qlp_coeff[1] * data[i - 2]; + sum += qlp_coeff[0] * data[i - 1]; + } + residual[i] = data[i] - (sum >> lp_quantization); + } + } +} + +#endif /* FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN */ +#endif /* HAVE_RISCV_VECTOR_H */ +#endif /* FLAC__RISCV_VECTOR */ +#endif /* FLAC__NO_ASM */ +#endif /* FLAC__INTEGER_ONLY_LIBRARY */ diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index c1c03e49ed..89e682ce01 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -959,6 +959,17 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon; #endif /* defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN */ +#if defined FLAC__CPU_RISCV64 && defined FLAC__HAS_RISCVINTRIN && defined HAVE_RISCV_VECTOR_H +#ifdef FLAC__RISCV_VECTOR + if(encoder->private_->cpuinfo.rv64.has_vector) { + if(encoder->protected_->max_lpc_order <= encoder->private_->cpuinfo.rv64.vlenb) { + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_riscv; + } + encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv; + } +#endif /* FLAC__RISCV_VECTOR */ +#endif /* defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN && HAVE_RISCV_VECTOR_H */ + if(encoder->private_->cpuinfo.use_asm) { # ifdef FLAC__CPU_IA32 FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);