From a9617fbdf95b78148e0cedac92a612186c583d1d Mon Sep 17 00:00:00 2001
From: Gurjant Kalsi
Date: Tue, 8 Aug 2023 00:59:25 +0000
Subject: [PATCH 1/4] Introduce RISC-V Vector Intrinsic Support

The RISC-V ISA has an optional "V" extension for vector support. This
patch introduces vector-accelerated routines for the following methods:

+ local_lpc_compute_autocorrelation
+ local_lpc_compute_residual_from_qlp_coefficients

Building with RISC-V Vector support is disabled by default because the
patch has so far been tested only for correctness under QEMU. It can be
enabled using `--enable-riscv-vector-optimizations` in autotools or
`-DRISCV_VECTOR=ON` using cmake.

Limitations:
+ RISC-V Vector support requires a very recent compiler (Clang 16 or
  later) for the time being.
+ The vector register width (VLEN) on RISC-V is chosen by the silicon
  vendor. This patch assumes a reasonable width of at least 128 bits
  per vector register for now.

Future Work:
+ The local_lpc_compute_residual_from_qlp_coefficients implementation
  was based heavily on the Intel AVX implementation. There is likely a
  more idiomatic RISC-V implementation that is feasible.
---
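Note (not part of the commit itself): the VLEN >= 128 assumption above
can be sanity-checked on a given target before trusting the vector
routines. A minimal standalone probe, assuming only a toolchain that
ships the RVV intrinsics (Clang 16 or later), might look like this:

    #include <riscv_vector.h>
    #include <stdio.h>

    int main(void)
    {
        /* __riscv_vsetvlmax_e64m8() returns how many 64-bit elements fit
         * into one group of 8 vector registers, i.e. VLEN * 8 / 64. A
         * result of 16 or more therefore means VLEN >= 128, the minimum
         * this patch assumes. */
        size_t vlmax = __riscv_vsetvlmax_e64m8();
        printf("e64/m8 vlmax = %zu, VLEN = %zu bits\n", vlmax, vlmax * 8);
        return 0;
    }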
 CMakeLists.txt                    |   1 +
 Makefile.am                       |   2 +
 cmake/CheckCPUArch.cmake          |   4 +
 cmake/CheckRV64Vector.c.in        |   7 +
 cmake/CheckRV64Vector.cmake       |  15 +
 config.cmake.h.in                 |   6 +
 configure.ac                      |  34 +-
 src/libFLAC/CMakeLists.txt        |  12 +
 src/libFLAC/Makefile.am           |   1 +
 src/libFLAC/include/private/lpc.h |   9 +
 src/libFLAC/lpc_intrin_riscv.c    | 633 ++++++++++++++++++++++++++++++
 src/libFLAC/stream_encoder.c      |  13 +
 12 files changed, 736 insertions(+), 1 deletion(-)
 create mode 100644 cmake/CheckRV64Vector.c.in
 create mode 100644 cmake/CheckRV64Vector.cmake
 create mode 100644 src/libFLAC/lpc_intrin_riscv.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb23b7d963..ed770c5093 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,6 +113,7 @@ check_include_file("inttypes.h" HAVE_INTTYPES_H)
 check_include_file("stdint.h" HAVE_STDINT_H)
 check_include_file("stdbool.h" HAVE_STDBOOL_H)
 check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
+check_include_file("riscv_vector.h" FLAC__HAS_RISCVINTRIN)
 
 if(NOT HAVE_STDINT_H OR NOT HAVE_STDBOOL_H)
     message(SEND_ERROR "Header stdint.h and/or stdbool.h not found")
diff --git a/Makefile.am b/Makefile.am
index 3bb523e880..78c75518b3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -43,6 +43,8 @@ EXTRA_DIST = \
 	flac-config.cmake.in \
 	cmake/CheckA64NEON.c.in \
 	cmake/CheckA64NEON.cmake \
+	cmake/CheckRV64Vector.c.in \
+	cmake/CheckRV64Vector.cmake \
 	cmake/CheckCPUArch.c.in \
 	cmake/CheckCPUArch.cmake \
 	cmake/FindOgg.cmake \
diff --git a/cmake/CheckCPUArch.cmake b/cmake/CheckCPUArch.cmake
index 665fa61501..4e41632481 100644
--- a/cmake/CheckCPUArch.cmake
+++ b/cmake/CheckCPUArch.cmake
@@ -25,3 +25,7 @@ endmacro(CHECK_CPU_ARCH_X86)
 macro(CHECK_CPU_ARCH_ARM64 VARIABLE)
     _CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE})
 endmacro(CHECK_CPU_ARCH_ARM64)
+
+macro(CHECK_CPU_ARCH_RISCV64 VARIABLE)
+    _CHECK_CPU_ARCH(riscv64 "defined(__riscv)" ${VARIABLE})
+endmacro(CHECK_CPU_ARCH_RISCV64)
diff --git a/cmake/CheckRV64Vector.c.in b/cmake/CheckRV64Vector.c.in
new file mode 100644
index 0000000000..fc9662305d
--- /dev/null
+++ b/cmake/CheckRV64Vector.c.in
@@ -0,0 +1,7 @@
+#include <riscv_vector.h>
+int main (void)
+{
+    size_t vl = __riscv_vsetvl_e64m2(8);
+    vfloat64m2_t a = __riscv_vfmv_v_f_f64m2(0.5, vl);
+    return 0;
+}
diff --git a/cmake/CheckRV64Vector.cmake b/cmake/CheckRV64Vector.cmake
new file mode 100644
index 0000000000..955aab70dc
--- /dev/null
+++ b/cmake/CheckRV64Vector.cmake
@@ -0,0 +1,15 @@
+macro(CHECK_RV64VECTOR VARIABLE)
+    if(NOT DEFINED HAVE_${VARIABLE})
+        message(STATUS "Check whether RV64 Vector can be used")
+        configure_file(${PROJECT_SOURCE_DIR}/cmake/CheckRV64Vector.c.in ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckRV64Vector.c @ONLY)
+        try_compile(HAVE_${VARIABLE} "${PROJECT_BINARY_DIR}"
+            "${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckRV64Vector.c")
+        if(HAVE_${VARIABLE})
+            message(STATUS "Check whether RV64 Vector can be used - yes")
+            set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_RV64VECTOR" FORCE)
+        else ()
+            message(STATUS "Check whether RV64 Vector can be used - no")
+        endif()
+    endif ()
+endmacro(CHECK_RV64VECTOR)
+
diff --git a/config.cmake.h.in b/config.cmake.h.in
index acc73f0849..1de71e2144 100644
--- a/config.cmake.h.in
+++ b/config.cmake.h.in
@@ -9,6 +9,9 @@
 /* Target processor ARM64 */
 #cmakedefine FLAC__CPU_ARM64
 
+/* Target processor RISC-V 64 */
+#cmakedefine FLAC__CPU_RISCV64
+
 /* Set FLAC__BYTES_PER_WORD to 8 (4 is the default) */
 #cmakedefine01 ENABLE_64_BIT_WORDS
 
@@ -31,6 +34,9 @@
 /* Set to 1 if <arm_neon.h> contains A64 intrinsics */
 #cmakedefine01 FLAC__HAS_A64NEONINTRIN
 
+/* Set to 1 if <riscv_vector.h> is available. */
+#cmakedefine01 FLAC__HAS_RISCVINTRIN
+
 /* define if building for Darwin / MacOS X */
 #cmakedefine FLAC__SYS_DARWIN
diff --git a/configure.ac b/configure.ac
index 418677c7a6..49cfbdb252 100644
--- a/configure.ac
+++ b/configure.ac
@@ -53,7 +53,7 @@ AM_PROG_CC_C_O
 AC_C_INLINE
 AC_C_TYPEOF
 
-AC_CHECK_HEADERS([stdint.h stdbool.h inttypes.h byteswap.h sys/auxv.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h])
+AC_CHECK_HEADERS([stdint.h stdbool.h inttypes.h byteswap.h sys/auxv.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h riscv_vector.h])
 
 if test "x$ac_cv_header_stdint_h" != xyes -o "x$ac_cv_header_stdbool_h" != xyes; then
 AC_MSG_ERROR("Header stdint.h and/or stdbool.h not found")
@@ -86,6 +86,13 @@ AC_DEFINE(FLAC__NO_ASM)
 AH_TEMPLATE(FLAC__NO_ASM, [define to disable use of assembly code])
 fi
 
+AC_ARG_ENABLE(riscv-vector-optimizations, AS_HELP_STRING([--enable-riscv-vector-optimizations],[Enable RISC-V Vector Optimization Routines]), riscv_vector_opt=yes, riscv_vector_opt=no)
+AM_CONDITIONAL(FLAC__RISCV_VECTOR, test "x$riscv_vector_opt" = xyes)
+if test "x$riscv_vector_opt" = xyes ; then
+AC_DEFINE(FLAC__RISCV_VECTOR)
+AH_TEMPLATE(FLAC__RISCV_VECTOR, [define to enable use of RISC-V vector extensions])
+fi
+
 dnl check for getauxval in standard library
 AC_CHECK_FUNCS(getauxval)
 
@@ -131,10 +138,16 @@ case "$host_cpu" in
 	AC_DEFINE(FLAC__CPU_ARM64)
 	AH_TEMPLATE(FLAC__CPU_ARM64, [define if building for ARM])
 	;;
+    riscv64)
+	cpu_riscv64=true
+	AC_DEFINE(FLAC__CPU_RISCV64)
+	AH_TEMPLATE(FLAC__CPU_RISCV64, [define if building for RISC-V 64])
+	;;
 esac
 AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue)
 AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
 AM_CONDITIONAL(FLAC__CPU_ARM64, test "x$cpu_arm64" = xtrue)
+AM_CONDITIONAL(FLAC__CPU_RISCV64, test "x$cpu_riscv64" = xtrue)
 
 if test "x$ac_cv_header_x86intrin_h" = xyes -a "x$asm_opt" = xyes; then
 AC_DEFINE([FLAC__HAS_X86INTRIN], 1, [Set to 1 if <x86intrin.h> is available.])
@@ -162,6 +175,25 @@ else
 AC_DEFINE([FLAC__HAS_NEONINTRIN], 0)
 fi
 
+if test "x$ac_cv_header_riscv_vector_h" = xyes -a "x$asm_opt" = xyes; then
+    AC_DEFINE([FLAC__HAS_RISCVINTRIN], 1, [Set to 1 if <riscv_vector.h> is available.])
+    AC_MSG_CHECKING([whether riscv_vector.h has Vector functions])
+    AC_COMPILE_IFELSE(
+        [AC_LANG_PROGRAM([[#include <riscv_vector.h>]],
+            [[size_t vl = __riscv_vsetvl_e64m2(8); vfloat64m2_t a = __riscv_vfmv_v_f_f64m2(0.5, vl);]])],
+        [AC_MSG_RESULT([yes])
+         has_riscvvector=yes],
+        [AC_MSG_RESULT([no])])
+    if test "x$has_riscvvector" = xyes; then
+        AC_DEFINE([FLAC__HAS_RISCVINTRIN], 1, [Set to 1 if <riscv_vector.h> has vector instructions.])
+        asm_optimisation=yes
+    else
+        AC_DEFINE([FLAC__HAS_RISCVINTRIN], 0)
+    fi
+else
+    AC_DEFINE([FLAC__HAS_RISCVINTRIN], 0)
+fi
+
 case "$host" in
 	i386-*-openbsd3.[[0-3]]) OBJ_FORMAT=aoutb ;;
 	*-*-cygwin|*mingw*) OBJ_FORMAT=win32 ;;
diff --git a/src/libFLAC/CMakeLists.txt b/src/libFLAC/CMakeLists.txt
index cf7368f60c..3894649b56 100644
--- a/src/libFLAC/CMakeLists.txt
+++ b/src/libFLAC/CMakeLists.txt
@@ -1,4 +1,5 @@
 option(WITH_ASM "Use any assembly optimization routines" ON)
+option(RISCV_VECTOR "Use RISC-V Vector Optimization" OFF)
 
 check_include_file("cpuid.h" HAVE_CPUID_H)
 check_include_file("sys/param.h" HAVE_SYS_PARAM_H)
@@ -9,6 +10,7 @@ check_function_exists(lround HAVE_LROUND)
 include(CheckCSourceCompiles)
 include(CheckCPUArch)
 include(CheckA64NEON)
+include(CheckRV64Vector)
 
 check_cpu_arch_x64(FLAC__CPU_X86_64)
 if(NOT FLAC__CPU_X86_64)
@@ -26,12 +28,21 @@ else()
 	if(FLAC__CPU_ARM64)
 		check_a64neon(FLAC__HAS_A64NEONINTRIN)
 	endif()
+
+	check_cpu_arch_riscv64(FLAC__CPU_RISCV64)
+	if(FLAC__CPU_RISCV64)
+		check_rv64vector(FLAC__HAS_RISCVINTRIN)
+	endif()
 endif()
 
 if(NOT WITH_ASM)
 	add_definitions(-DFLAC__NO_ASM)
 endif()
 
+if(RISCV_VECTOR)
+	add_definitions(-DFLAC__RISCV_VECTOR)
+endif()
+
 include_directories("include")
 
 add_library(FLAC
@@ -53,6 +64,7 @@ add_library(FLAC
 	lpc_intrin_sse41.c
 	lpc_intrin_avx2.c
 	lpc_intrin_fma.c
+	lpc_intrin_riscv.c
 	md5.c
 	memory.c
 	metadata_iterators.c
diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am
index 618939dfc3..a4433cc23a 100644
--- a/src/libFLAC/Makefile.am
+++ b/src/libFLAC/Makefile.am
@@ -100,6 +100,7 @@ libFLAC_sources = \
 	lpc_intrin_avx2.c \
 	lpc_intrin_fma.c \
 	lpc_intrin_neon.c \
+	lpc_intrin_riscv.c \
 	md5.c \
 	memory.c \
 	metadata_iterators.c \
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index 766f0560ea..93051fdeb2 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -92,6 +92,9 @@ void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[]
 void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
 void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
 #endif
+#if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN
+void FLAC__lpc_compute_autocorrelation_intrin_riscv(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
+#endif
 #endif /* FLAC__NO_ASM */
 
 /*
@@ -162,6 +165,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
 # endif
 
+#ifdef FLAC__CPU_RISCV64
+// void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
+// void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
+#endif
+
 # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
 # ifdef FLAC__SSE2_SUPPORTED
 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
diff --git a/src/libFLAC/lpc_intrin_riscv.c b/src/libFLAC/lpc_intrin_riscv.c
new file mode 100644
index 0000000000..a1fad98060
--- /dev/null
+++ b/src/libFLAC/lpc_intrin_riscv.c
@@ -0,0 +1,633 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2023  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "private/cpu.h"
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#ifdef FLAC__RISCV_VECTOR
+#if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN
+#include "private/lpc.h"
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+#include "private/macros.h"
+#include <riscv_vector.h>
+#include
+
+void FLAC__lpc_compute_autocorrelation_intrin_riscv(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
+{
+	uint32_t i;
+	vfloat64m8_t sample;
+
+	// Set LMUL=8 to group vector registers into groups of 8. Assuming a VLEN of
+	// at least 128b, that should yield 128x8=1024b per vector register group.
+	// 1024b is enough for 16 Float64s, so we should be able to process the
+	// entire lag window with a single virtual register.
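+	//
+	// The loop below keeps a sliding window of past samples: each iteration
+	// broadcasts data[i] into `sample`, slides the window vector `d` up by
+	// one element so that d[k] holds data[i - k], then accumulates
+	// sample * d into `sums`, so that sums[k] ends up holding the lag-k
+	// autocorrelation sum of data[i] * data[i - k].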
+ size_t vl = __riscv_vsetvl_e64m8(lag); + + vfloat64m8_t sums = __riscv_vfmv_v_f_f64m8(0.0, vl); + vfloat64m8_t d = __riscv_vfmv_v_f_f64m8(0.0, vl); + + FLAC__ASSERT(vl == lag); + + for(i = 0; i < data_len; i++) { + const double new_sample = data[i]; + sample = __riscv_vfmv_v_f_f64m8(new_sample, vl); + d = __riscv_vfslide1up_vf_f64m8(d, new_sample, vl); + sums = __riscv_vfmacc_vv_f64m8(sums, sample, d, vl); + } + __riscv_vse64_v_f64m8(autoc, sums, vl); +} + +void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]) +{ + int i; + FLAC__int32 sum; + vint32m1_t data_vec; + vint32m1_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + vint32m1_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + vint32m1_t summ, mull; + size_t vl = __riscv_vsetvl_e32m1(4); + + // TODO(gkalsi): Fix this here? + FLAC__ASSERT(vl == 4); + + FLAC__ASSERT(order > 0); + FLAC__ASSERT(order <= 32); + + if(order <= 12) { + if(order > 8) { + if(order > 10) { + if(order == 12) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + q9 = __riscv_vmv_v_x_i32m1(qlp_coeff[9], vl); + q10 = __riscv_vmv_v_x_i32m1(qlp_coeff[10], vl); + q11 = __riscv_vmv_v_x_i32m1(qlp_coeff[11], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + d9 = __riscv_vle32_v_i32m1(data + i - 10, vl); + d10 = __riscv_vle32_v_i32m1(data + i - 11, vl); + d11 = __riscv_vle32_v_i32m1(data + i - 12, vl); + + summ = __riscv_vmul_vv_i32m1(q11, d11, vl); + mull = __riscv_vmul_vv_i32m1(q10, d10, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q9, d9, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q8, d8, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + 
__riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 11 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + q9 = __riscv_vmv_v_x_i32m1(qlp_coeff[9], vl); + q10 = __riscv_vmv_v_x_i32m1(qlp_coeff[10], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + d9 = __riscv_vle32_v_i32m1(data + i - 10, vl); + d10 = __riscv_vle32_v_i32m1(data + i - 11, vl); + + summ = __riscv_vmul_vv_i32m1(q10, d10, vl); + mull = __riscv_vmul_vv_i32m1(q9, d9, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q8, d8, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + else { + if(order == 10) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + q9 = __riscv_vmv_v_x_i32m1(qlp_coeff[9], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + d9 = __riscv_vle32_v_i32m1(data + i - 10, vl); + + summ = __riscv_vmul_vv_i32m1(q9, d9, vl); + mull = __riscv_vmul_vv_i32m1(q8, d8, vl); + summ = 
__riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 9 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + q8 = __riscv_vmv_v_x_i32m1(qlp_coeff[8], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + d8 = __riscv_vle32_v_i32m1(data + i - 9, vl); + + summ = __riscv_vmul_vv_i32m1(q8, d8, vl); + mull = __riscv_vmul_vv_i32m1(q7, d7, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + } + else if(order > 4) { + if(order > 6) { + if(order == 8) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + q7 = __riscv_vmv_v_x_i32m1(qlp_coeff[7], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = 
__riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + d7 = __riscv_vle32_v_i32m1(data + i - 8, vl); + + summ = __riscv_vmul_vv_i32m1(q7, d7, vl); + mull = __riscv_vmul_vv_i32m1(q6, d6, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 7 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + q6 = __riscv_vmv_v_x_i32m1(qlp_coeff[6], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + d6 = __riscv_vle32_v_i32m1(data + i - 7, vl); + summ = __riscv_vmul_vv_i32m1(q6, d6, vl); + mull = __riscv_vmul_vv_i32m1(q5, d5, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + else { + if(order == 6) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + q5 = __riscv_vmv_v_x_i32m1(qlp_coeff[5], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + d5 = __riscv_vle32_v_i32m1(data + i - 6, vl); + + summ = __riscv_vmul_vv_i32m1(q5, d5, vl); + mull = __riscv_vmul_vv_i32m1(q4, d4, vl); + summ = __riscv_vadd_vv_i32m1(summ, 
mull, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 5 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + q4 = __riscv_vmv_v_x_i32m1(qlp_coeff[4], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + d4 = __riscv_vle32_v_i32m1(data + i - 5, vl); + + summ = __riscv_vmul_vv_i32m1(q4, d4, vl); + mull = __riscv_vmul_vv_i32m1(q3, d3, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + } + else { + if(order > 2) { + if(order == 4) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + q3 = __riscv_vmv_v_x_i32m1(qlp_coeff[3], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + d3 = __riscv_vle32_v_i32m1(data + i - 4, vl); + + summ = __riscv_vmul_vv_i32m1(q3, d3, vl); + mull = __riscv_vmul_vv_i32m1(q2, d2, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 3 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + q2 = __riscv_vmv_v_x_i32m1(qlp_coeff[2], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + d2 = __riscv_vle32_v_i32m1(data + i - 3, vl); + + summ = __riscv_vmul_vv_i32m1(q2, d2, vl); + mull = __riscv_vmul_vv_i32m1(q1, d1, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = 
__riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + else { + if(order == 2) { + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + q1 = __riscv_vmv_v_x_i32m1(qlp_coeff[1], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + d1 = __riscv_vle32_v_i32m1(data + i - 2, vl); + + summ = __riscv_vmul_vv_i32m1(q1, d1, vl); + mull = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vadd_vv_i32m1(summ, mull, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + else { /* order == 1 */ + q0 = __riscv_vmv_v_x_i32m1(qlp_coeff[0], vl); + + for(i = 0; i < (int)data_len - 3; i += 4) { + d0 = __riscv_vle32_v_i32m1(data + i - 1, vl); + + summ = __riscv_vmul_vv_i32m1(q0, d0, vl); + summ = __riscv_vsra_vx_i32m1(summ, lp_quantization, vl); + + data_vec = __riscv_vle32_v_i32m1(data + i, vl); + data_vec = __riscv_vsub_vv_i32m1(data_vec, summ, vl); + __riscv_vse32_v_i32m1(residual + i, data_vec, vl); + } + } + } + } + for(; i < (int)data_len; i++) { + sum = 0; + switch(order) { + case 12: + sum += qlp_coeff[11] * data[i - 12]; /* Falls through. */ + case 11: + sum += qlp_coeff[10] * data[i - 11]; /* Falls through. */ + case 10: + sum += qlp_coeff[9] * data[i - 10]; /* Falls through. */ + case 9: + sum += qlp_coeff[8] * data[i - 9]; /* Falls through. */ + case 8: + sum += qlp_coeff[7] * data[i - 8]; /* Falls through. */ + case 7: + sum += qlp_coeff[6] * data[i - 7]; /* Falls through. */ + case 6: + sum += qlp_coeff[5] * data[i - 6]; /* Falls through. */ + case 5: + sum += qlp_coeff[4] * data[i - 5]; /* Falls through. */ + case 4: + sum += qlp_coeff[3] * data[i - 4]; /* Falls through. */ + case 3: + sum += qlp_coeff[2] * data[i - 3]; /* Falls through. */ + case 2: + sum += qlp_coeff[1] * data[i - 2]; /* Falls through. */ + case 1: + sum += qlp_coeff[0] * data[i - 1]; + } + residual[i] = data[i] - (sum >> lp_quantization); + } + } + else { /* order > 12 */ + for(i = 0; i < (int)data_len; i++) { + sum = 0; + switch(order) { + case 32: + sum += qlp_coeff[31] * data[i - 32]; /* Falls through. */ + case 31: + sum += qlp_coeff[30] * data[i - 31]; /* Falls through. */ + case 30: + sum += qlp_coeff[29] * data[i - 30]; /* Falls through. */ + case 29: + sum += qlp_coeff[28] * data[i - 29]; /* Falls through. */ + case 28: + sum += qlp_coeff[27] * data[i - 28]; /* Falls through. */ + case 27: + sum += qlp_coeff[26] * data[i - 27]; /* Falls through. */ + case 26: + sum += qlp_coeff[25] * data[i - 26]; /* Falls through. */ + case 25: + sum += qlp_coeff[24] * data[i - 25]; /* Falls through. */ + case 24: + sum += qlp_coeff[23] * data[i - 24]; /* Falls through. */ + case 23: + sum += qlp_coeff[22] * data[i - 23]; /* Falls through. */ + case 22: + sum += qlp_coeff[21] * data[i - 22]; /* Falls through. */ + case 21: + sum += qlp_coeff[20] * data[i - 21]; /* Falls through. */ + case 20: + sum += qlp_coeff[19] * data[i - 20]; /* Falls through. */ + case 19: + sum += qlp_coeff[18] * data[i - 19]; /* Falls through. */ + case 18: + sum += qlp_coeff[17] * data[i - 18]; /* Falls through. */ + case 17: + sum += qlp_coeff[16] * data[i - 17]; /* Falls through. */ + case 16: + sum += qlp_coeff[15] * data[i - 16]; /* Falls through. */ + case 15: + sum += qlp_coeff[14] * data[i - 15]; /* Falls through. 
*/
+			case 14:
+				sum += qlp_coeff[13] * data[i - 14]; /* Falls through. */
+			case 13:
+				sum += qlp_coeff[12] * data[i - 13];
+				sum += qlp_coeff[11] * data[i - 12];
+				sum += qlp_coeff[10] * data[i - 11];
+				sum += qlp_coeff[9] * data[i - 10];
+				sum += qlp_coeff[8] * data[i - 9];
+				sum += qlp_coeff[7] * data[i - 8];
+				sum += qlp_coeff[6] * data[i - 7];
+				sum += qlp_coeff[5] * data[i - 6];
+				sum += qlp_coeff[4] * data[i - 5];
+				sum += qlp_coeff[3] * data[i - 4];
+				sum += qlp_coeff[2] * data[i - 3];
+				sum += qlp_coeff[1] * data[i - 2];
+				sum += qlp_coeff[0] * data[i - 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+}
+
+#endif /* FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN */
+#endif /* FLAC__RISCV_VECTOR */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index c1c03e49ed..edba2f7f67 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -959,6 +959,19 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 		encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
 #endif /* defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN */
 
+#if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN
+#ifdef FLAC__RISCV_VECTOR
+	if(encoder->protected_->max_lpc_order <= 16)
+		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_riscv;
+	else
+		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
+
+	// encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_riscv;
+	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv;
+	// encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_riscv;
+#endif /* FLAC__RISCV_VECTOR */
+#endif /* defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN */
+
 	if(encoder->private_->cpuinfo.use_asm) {
 # ifdef FLAC__CPU_IA32
 	FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);

From 3da085e558f3f4ccded373770e06d9bea01d1349 Mon Sep 17 00:00:00 2001
From: Gurjant Kalsi
Date: Thu, 10 Aug 2023 04:01:31 +0000
Subject: [PATCH 2/4] Introduce Dynamic Detection of the RISC-V Vector Unit

This patch dynamically detects whether the RISC-V vector unit is
available and enables the intrinsic routines only if it is.

Tested by launching QEMU with the vector extensions enabled and
disabled and observing that the intrinsic routines were patched in
only when the vector unit was present.
---
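Note (not part of the commit itself): the runtime detection reduces to a
single getauxval() lookup. A standalone probe equivalent to the check in
rv64_cpu_info() below, assuming Linux and <sys/auxv.h>, is:

    #include <stdio.h>
    #include <sys/auxv.h>

    /* On Linux, each single-letter RISC-V ISA extension is advertised as
     * one bit of AT_HWCAP; bit ('v' - 'a') is the "V" vector extension. */
    #define ISA_V_HWCAP (1UL << ('v' - 'a'))

    int main(void)
    {
        unsigned long hw_cap = getauxval(AT_HWCAP);
        printf("RISC-V vector unit: %s\n",
               (hw_cap & ISA_V_HWCAP) ? "present" : "absent");
        return 0;
    }

For the QEMU test described above, the vector unit can be toggled from
the command line, e.g. -cpu rv64,v=true,vlen=128 versus -cpu rv64,v=false
(exact property spellings may vary between QEMU versions).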
 config.cmake.h.in                 |  3 +++
 src/libFLAC/CMakeLists.txt        |  1 +
 src/libFLAC/cpu.c                 | 29 +++++++++++++++++++++++++++++
 src/libFLAC/include/private/cpu.h |  7 +++++++
 src/libFLAC/include/private/lpc.h |  2 --
 src/libFLAC/lpc_intrin_riscv.c    |  1 -
 src/libFLAC/stream_encoder.c      | 14 ++++++--------
 7 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/config.cmake.h.in b/config.cmake.h.in
index 1de71e2144..a935ec2e84 100644
--- a/config.cmake.h.in
+++ b/config.cmake.h.in
@@ -109,6 +109,9 @@
 /* Define to 1 if you have the <sys/param.h> header file. */
 #cmakedefine HAVE_SYS_PARAM_H
 
+/* Define to 1 if you have the <sys/auxv.h> header file. */
+#cmakedefine HAVE_SYS_AUXV_H
+
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #cmakedefine HAVE_SYS_STAT_H
diff --git a/src/libFLAC/CMakeLists.txt b/src/libFLAC/CMakeLists.txt
index 3894649b56..2d23132a82 100644
--- a/src/libFLAC/CMakeLists.txt
+++ b/src/libFLAC/CMakeLists.txt
@@ -3,6 +3,7 @@ option(RISCV_VECTOR "Use RISC-V Vector Optimization" OFF)
 
 check_include_file("cpuid.h" HAVE_CPUID_H)
 check_include_file("sys/param.h" HAVE_SYS_PARAM_H)
+check_include_file("sys/auxv.h" HAVE_SYS_AUXV_H)
 
 set(CMAKE_REQUIRED_LIBRARIES m)
 check_function_exists(lround HAVE_LROUND)
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index d088e3c0b7..2ba4921ac8 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -57,6 +57,10 @@
 #include
 #endif
 
+#if defined(FLAC__RISCV_VECTOR) && defined(FLAC__HAS_RISCVINTRIN)
+#include <riscv_vector.h>
+#endif
+
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN && !defined FLAC__NO_ASM
 
 /* these are flags in EDX of CPUID AX=00000001 */
@@ -231,6 +235,26 @@ x86_cpu_info (FLAC__CPUInfo *info)
 #endif
 }
 
+static void
+rv64_cpu_info(FLAC__CPUInfo *info)
+{
+#if defined(FLAC__CPU_RISCV64) && defined(FLAC__HAS_RISCVINTRIN) && defined(FLAC__HAS_RISCVINTRIN) && !defined(FLAC__NO_ASM) && defined(HAVE_SYS_AUXV_H)
+#define ISA_V_HWCAP (1 << ('v' - 'a'))
+	// Check that the kernel and the hardware support RISC-V Vector.
+	unsigned long hw_cap = getauxval(AT_HWCAP);
+	info->rv64.has_vector = (hw_cap & ISA_V_HWCAP) == ISA_V_HWCAP;
+	if(info->rv64.has_vector) {
+		info->rv64.vlenb = __riscv_vsetvlmax_e8m1();
+	}
+	else {
+		info->rv64.vlenb = 0;
+	}
+#else
+	info->rv64.has_vector = false;
+	info->rv64.vlenb = 0;
+#endif
+}
+
 void FLAC__cpu_info (FLAC__CPUInfo *info)
 {
 	memset(info, 0, sizeof(*info));
@@ -239,6 +263,8 @@ void FLAC__cpu_info (FLAC__CPUInfo *info)
 	info->type = FLAC__CPUINFO_TYPE_IA32;
 #elif defined FLAC__CPU_X86_64
 	info->type = FLAC__CPUINFO_TYPE_X86_64;
+#elif defined FLAC__CPU_RISCV64
+	info->type = FLAC__CPUINFO_TYPE_RISCV_64;
 #else
 	info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
 #endif
@@ -248,6 +274,9 @@ void FLAC__cpu_info (FLAC__CPUInfo *info)
 		case FLAC__CPUINFO_TYPE_X86_64:
 			x86_cpu_info (info);
 			break;
+		case FLAC__CPUINFO_TYPE_RISCV_64:
+			rv64_cpu_info(info);
+			break;
 		default:
 			info->use_asm = false;
 			break;
diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h
index 8843c74bfe..c961777f85 100644
--- a/src/libFLAC/include/private/cpu.h
+++ b/src/libFLAC/include/private/cpu.h
@@ -162,6 +162,7 @@
 typedef enum {
 	FLAC__CPUINFO_TYPE_IA32,
 	FLAC__CPUINFO_TYPE_X86_64,
+	FLAC__CPUINFO_TYPE_RISCV_64,
 	FLAC__CPUINFO_TYPE_UNKNOWN
 } FLAC__CPUInfo_Type;
@@ -183,10 +184,16 @@ typedef struct {
 	FLAC__bool bmi2;
 } FLAC__CPUInfo_x86;
 
+typedef struct {
+	FLAC__bool has_vector;
+	FLAC__uint32 vlenb; // Vector register length in bytes, if the CPU supports it.
+} FLAC__CPUInfo_RV64;
+
 typedef struct {
 	FLAC__bool use_asm;
 	FLAC__CPUInfo_Type type;
 	FLAC__CPUInfo_x86 x86;
+	FLAC__CPUInfo_RV64 rv64;
 } FLAC__CPUInfo;
 
 void FLAC__cpu_info(FLAC__CPUInfo *info);
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index 93051fdeb2..cd934e579f 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -166,9 +166,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
 # endif
 
 #ifdef FLAC__CPU_RISCV64
-// void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
-// void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_riscv(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
 #endif
 
 # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
diff --git a/src/libFLAC/lpc_intrin_riscv.c b/src/libFLAC/lpc_intrin_riscv.c
index a1fad98060..144f492edc 100644
--- a/src/libFLAC/lpc_intrin_riscv.c
+++ b/src/libFLAC/lpc_intrin_riscv.c
@@ -41,7 +41,6 @@
 #include "FLAC/format.h"
 #include "private/macros.h"
 #include <riscv_vector.h>
-#include
 
 void FLAC__lpc_compute_autocorrelation_intrin_riscv(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
 {
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index edba2f7f67..43bd6d6fd6 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -961,14 +961,12 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 
 #if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN
 #ifdef FLAC__RISCV_VECTOR
-	if(encoder->protected_->max_lpc_order <= 16)
-		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_riscv;
-	else
-		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
-
-	// encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_riscv;
-	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv;
-	// encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_riscv;
+	if(encoder->private_->cpuinfo.rv64.has_vector) {
+		if(encoder->protected_->max_lpc_order <= encoder->private_->cpuinfo.rv64.vlenb) {
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_riscv;
+		}
+		encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv;
+	}
 #endif /* FLAC__RISCV_VECTOR */
 #endif /* defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN */

From 3f70de0cb738aab68d71d08c988a897ab78a4fe4 Mon Sep 17 00:00:00 2001
From: Gurjant Kalsi
Date: Wed, 16 Aug 2023 16:43:56 +0000
Subject: [PATCH 3/4] Test for riscv_vector before calling __riscv_vsetvlmax_e8m1

This was causing a build failure if riscv_vector was not available on
the system.
---
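Note (not part of the commit itself): the failure was an
undeclared-identifier error, because rv64_cpu_info() referenced
__riscv_vsetvlmax_e8m1() even in builds where <riscv_vector.h> was never
included. The fix keeps every intrinsic reference behind the same
preprocessor test as the include; a sketch of the shape, using a
hypothetical helper name, is:

    #if defined(FLAC__RISCV_VECTOR) && defined(FLAC__HAS_RISCVINTRIN)
    #include <riscv_vector.h>
    /* vector build: VLEN / 8, i.e. the vector register size in bytes */
    static unsigned long probe_vlenb(void) { return __riscv_vsetvlmax_e8m1(); }
    #else
    /* non-vector build: report no vector unit, never touch intrinsics */
    static unsigned long probe_vlenb(void) { return 0; }
    #endif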
 src/libFLAC/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index 2ba4921ac8..a5e5ea8ad9 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -238,7 +238,7 @@ x86_cpu_info (FLAC__CPUInfo *info)
 static void
 rv64_cpu_info(FLAC__CPUInfo *info)
 {
-#if defined(FLAC__CPU_RISCV64) && defined(FLAC__HAS_RISCVINTRIN) && defined(FLAC__HAS_RISCVINTRIN) && !defined(FLAC__NO_ASM) && defined(HAVE_SYS_AUXV_H)
+#if defined(FLAC__CPU_RISCV64) && defined(FLAC__HAS_RISCVINTRIN) && defined(FLAC__HAS_RISCVINTRIN) && !defined(FLAC__NO_ASM) && defined(HAVE_SYS_AUXV_H) && defined(FLAC__RISCV_VECTOR)
 #define ISA_V_HWCAP (1 << ('v' - 'a'))
 	// Check that the kernel and the hardware support RISC-V Vector.
 	unsigned long hw_cap = getauxval(AT_HWCAP);

From b9e0a83863eac30cab5055c18acea1cc87271551 Mon Sep 17 00:00:00 2001
From: Gurjant Kalsi
Date: Thu, 17 Aug 2023 01:12:25 +0000
Subject: [PATCH 4/4] Correctly detect riscv_vector.h using "-march=rv64gcv"

When detecting "riscv_vector.h" with autotools and cmake, invoke the
toolchain with -march=rv64gcv so the header probe actually compiles.
---
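Note (not part of the commit itself): most RISC-V toolchains only allow
<riscv_vector.h> to be included in a translation unit that is itself
compiled with the vector extension enabled, so a plain header probe
fails. What both build systems now effectively compile is the following
probe (the compiler invocation in the comment is an assumed example):

    /* Must be built with the vector extension enabled, e.g.:
     *     clang --target=riscv64 -march=rv64gcv -c probe.c
     */
    #include <riscv_vector.h>

    int main(void)
    {
        size_t vl = __riscv_vsetvl_e64m2(8);              /* request 8 x f64 at LMUL=2 */
        vfloat64m2_t a = __riscv_vfmv_v_f_f64m2(0.5, vl); /* broadcast a constant */
        (void)a;
        return 0;
    }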
 CMakeLists.txt                 | 8 +++++++-
 cmake/CheckRV64Vector.cmake    | 2 +-
 config.cmake.h.in              | 3 +++
 configure.ac                   | 4 +++-
 src/libFLAC/CMakeLists.txt     | 3 +++
 src/libFLAC/cpu.c              | 4 ++--
 src/libFLAC/lpc_intrin_riscv.c | 2 ++
 src/libFLAC/stream_encoder.c   | 4 ++--
 8 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed770c5093..ee6be886ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,7 +113,13 @@ check_include_file("inttypes.h" HAVE_INTTYPES_H)
 check_include_file("stdint.h" HAVE_STDINT_H)
 check_include_file("stdbool.h" HAVE_STDBOOL_H)
 check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
-check_include_file("riscv_vector.h" FLAC__HAS_RISCVINTRIN)
+
+# Toolchains won't allow riscv_vector.h to be included unless the
+# vector extensions are enabled.
+set(SAVED_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
+check_include_file("riscv_vector.h" HAVE_RISCV_VECTOR_H)
+set(CMAKE_REQUIRED_FLAGS "${SAVED_CMAKE_REQUIRED_FLAGS}")
 
 if(NOT HAVE_STDINT_H OR NOT HAVE_STDBOOL_H)
     message(SEND_ERROR "Header stdint.h and/or stdbool.h not found")
diff --git a/cmake/CheckRV64Vector.cmake b/cmake/CheckRV64Vector.cmake
index 955aab70dc..c73f65e8f0 100644
--- a/cmake/CheckRV64Vector.cmake
+++ b/cmake/CheckRV64Vector.cmake
@@ -3,7 +3,7 @@ macro(CHECK_RV64VECTOR VARIABLE)
     message(STATUS "Check whether RV64 Vector can be used")
     configure_file(${PROJECT_SOURCE_DIR}/cmake/CheckRV64Vector.c.in ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckRV64Vector.c @ONLY)
     try_compile(HAVE_${VARIABLE} "${PROJECT_BINARY_DIR}"
-        "${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckRV64Vector.c")
+        "${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckRV64Vector.c" COMPILE_DEFINITIONS "-march=rv64gcv")
     if(HAVE_${VARIABLE})
         message(STATUS "Check whether RV64 Vector can be used - yes")
         set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_RV64VECTOR" FORCE)
diff --git a/config.cmake.h.in b/config.cmake.h.in
index a935ec2e84..53fa41d0c9 100644
--- a/config.cmake.h.in
+++ b/config.cmake.h.in
@@ -121,6 +121,9 @@
 /* Define to 1 if you have the <termios.h> header file. */
 #cmakedefine HAVE_TERMIOS_H
 
+/* Define to 1 if you have the <riscv_vector.h> header file. */
+#cmakedefine HAVE_RISCV_VECTOR_H
+
 /* Define to 1 if typeof works with your compiler. */
 #cmakedefine HAVE_TYPEOF
diff --git a/configure.ac b/configure.ac
index 49cfbdb252..06d9dafb68 100644
--- a/configure.ac
+++ b/configure.ac
@@ -53,7 +53,7 @@ AM_PROG_CC_C_O
 AC_C_INLINE
 AC_C_TYPEOF
 
-AC_CHECK_HEADERS([stdint.h stdbool.h inttypes.h byteswap.h sys/auxv.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h riscv_vector.h])
+AC_CHECK_HEADERS([stdint.h stdbool.h inttypes.h byteswap.h sys/auxv.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h])
 
 if test "x$ac_cv_header_stdint_h" != xyes -o "x$ac_cv_header_stdbool_h" != xyes; then
 AC_MSG_ERROR("Header stdint.h and/or stdbool.h not found")
@@ -89,6 +89,8 @@ fi
 AC_ARG_ENABLE(riscv-vector-optimizations, AS_HELP_STRING([--enable-riscv-vector-optimizations],[Enable RISC-V Vector Optimization Routines]), riscv_vector_opt=yes, riscv_vector_opt=no)
 AM_CONDITIONAL(FLAC__RISCV_VECTOR, test "x$riscv_vector_opt" = xyes)
 if test "x$riscv_vector_opt" = xyes ; then
+CFLAGS="-march=rv64gcv $CFLAGS"
+AC_CHECK_HEADERS([riscv_vector.h])
 AC_DEFINE(FLAC__RISCV_VECTOR)
 AH_TEMPLATE(FLAC__RISCV_VECTOR, [define to enable use of RISC-V vector extensions])
 fi
diff --git a/src/libFLAC/CMakeLists.txt b/src/libFLAC/CMakeLists.txt
index 2d23132a82..2e5fe52cdc 100644
--- a/src/libFLAC/CMakeLists.txt
+++ b/src/libFLAC/CMakeLists.txt
@@ -33,6 +33,9 @@ else()
 	check_cpu_arch_riscv64(FLAC__CPU_RISCV64)
 	if(FLAC__CPU_RISCV64)
 		check_rv64vector(FLAC__HAS_RISCVINTRIN)
+		if (RISCV_VECTOR AND FLAC__HAS_RISCVINTRIN)
+			set_property(SOURCE lpc_intrin_riscv.c cpu.c APPEND_STRING PROPERTY COMPILE_FLAGS " -march=rv64gcv ")
+		endif()
 	endif()
 endif()
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index a5e5ea8ad9..b381a55867 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -57,7 +57,7 @@
 #include
 #endif
 
-#if defined(FLAC__RISCV_VECTOR) && defined(FLAC__HAS_RISCVINTRIN)
+#if defined(HAVE_RISCV_VECTOR_H) && defined(FLAC__RISCV_VECTOR) && defined(FLAC__HAS_RISCVINTRIN)
 #include <riscv_vector.h>
 #endif
@@ -238,7 +238,7 @@ x86_cpu_info (FLAC__CPUInfo *info)
 static void
 rv64_cpu_info(FLAC__CPUInfo *info)
 {
-#if defined(FLAC__CPU_RISCV64) && defined(FLAC__HAS_RISCVINTRIN) && defined(FLAC__HAS_RISCVINTRIN) && !defined(FLAC__NO_ASM) && defined(HAVE_SYS_AUXV_H) && defined(FLAC__RISCV_VECTOR)
+#if defined(FLAC__CPU_RISCV64) && defined(FLAC__HAS_RISCVINTRIN) && !defined(FLAC__NO_ASM) && defined(HAVE_SYS_AUXV_H) && defined(FLAC__RISCV_VECTOR) && defined(HAVE_RISCV_VECTOR_H)
 #define ISA_V_HWCAP (1 << ('v' - 'a'))
 	// Check that the kernel and the hardware support RISC-V Vector.
 	unsigned long hw_cap = getauxval(AT_HWCAP);
diff --git a/src/libFLAC/lpc_intrin_riscv.c b/src/libFLAC/lpc_intrin_riscv.c
index 144f492edc..bd180cadb3 100644
--- a/src/libFLAC/lpc_intrin_riscv.c
+++ b/src/libFLAC/lpc_intrin_riscv.c
@@ -35,6 +35,7 @@
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 #ifndef FLAC__NO_ASM
 #ifdef FLAC__RISCV_VECTOR
+#ifdef HAVE_RISCV_VECTOR_H
 #if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN
 #include "private/lpc.h"
 #include "FLAC/assert.h"
@@ -627,6 +628,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv(const FLAC__i
 }
 
 #endif /* FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN */
+#endif /* HAVE_RISCV_VECTOR_H */
 #endif /* FLAC__RISCV_VECTOR */
 #endif /* FLAC__NO_ASM */
 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index 43bd6d6fd6..89e682ce01 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -959,7 +959,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 		encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
 #endif /* defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN */
 
-#if defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN
+#if defined FLAC__CPU_RISCV64 && defined FLAC__HAS_RISCVINTRIN && defined HAVE_RISCV_VECTOR_H
 #ifdef FLAC__RISCV_VECTOR
 	if(encoder->private_->cpuinfo.rv64.has_vector) {
 		if(encoder->protected_->max_lpc_order <= encoder->private_->cpuinfo.rv64.vlenb) {
@@ -968,7 +968,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 		encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_riscv;
 	}
 #endif /* FLAC__RISCV_VECTOR */
-#endif /* defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN */
+#endif /* defined FLAC__CPU_RISCV64 && FLAC__HAS_RISCVINTRIN && HAVE_RISCV_VECTOR_H */
 
 	if(encoder->private_->cpuinfo.use_asm) {
 # ifdef FLAC__CPU_IA32