From 5b521e6ccc514cb85ef579b167ef1bf979b66872 Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Thu, 29 Feb 2024 16:04:38 +0100
Subject: [PATCH 01/10] copy over files from lapackv4 repo

---
 LAPACK_CPP/Makefile                           |  43 +
 LAPACK_CPP/example/.gitignore                 |   2 +
 LAPACK_CPP/example/profile_lasr3.cpp          |  99 +++
 LAPACK_CPP/example/test_lasr3.cpp             | 115 +++
 LAPACK_CPP/include/lapack_c.h                 |  13 +
 LAPACK_CPP/include/lapack_c/lartg.h           |  22 +
 LAPACK_CPP/include/lapack_c/lasr3.h           |  94 ++
 LAPACK_CPP/include/lapack_c/rot.h             |  46 +
 LAPACK_CPP/include/lapack_c/util.h            |  19 +
 LAPACK_CPP/include/lapack_cpp.hpp             |   9 +
 LAPACK_CPP/include/lapack_cpp/base.hpp        |  10 +
 LAPACK_CPP/include/lapack_cpp/base/enums.hpp  |  84 ++
 LAPACK_CPP/include/lapack_cpp/base/matrix.hpp | 334 +++++++
 .../include/lapack_cpp/base/memory_block.hpp  | 136 +++
 LAPACK_CPP/include/lapack_cpp/base/types.hpp  |  33 +
 LAPACK_CPP/include/lapack_cpp/base/vector.hpp | 203 +++++
 LAPACK_CPP/include/lapack_cpp/blas.hpp        |   6 +
 LAPACK_CPP/include/lapack_cpp/blas/rot.hpp    |  24 +
 LAPACK_CPP/include/lapack_cpp/lapack.hpp      |   7 +
 .../include/lapack_cpp/lapack/lartg.hpp       |  16 +
 .../include/lapack_cpp/lapack/lasr3.hpp       | 815 ++++++++++++++++++
 LAPACK_CPP/include/lapack_cpp/utils.hpp       |  87 ++
 LAPACK_CPP/src/lapack_c/lartg_c.f90           |  53 ++
 LAPACK_CPP/src/lapack_c/lasr3_c.cpp           | 206 +++++
 LAPACK_CPP/src/lapack_c/rot_c.f90             |  57 ++
 LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp       |  43 +
 LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp         |  67 ++
 27 files changed, 2643 insertions(+)
 create mode 100644 LAPACK_CPP/Makefile
 create mode 100644 LAPACK_CPP/example/.gitignore
 create mode 100644 LAPACK_CPP/example/profile_lasr3.cpp
 create mode 100644 LAPACK_CPP/example/test_lasr3.cpp
 create mode 100644 LAPACK_CPP/include/lapack_c.h
 create mode 100644 LAPACK_CPP/include/lapack_c/lartg.h
 create mode 100644 LAPACK_CPP/include/lapack_c/lasr3.h
 create mode 100644 LAPACK_CPP/include/lapack_c/rot.h
 create mode 100644 LAPACK_CPP/include/lapack_c/util.h
 create mode 100644 LAPACK_CPP/include/lapack_cpp.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/base.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/base/enums.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/base/matrix.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/base/types.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/base/vector.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/blas.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/blas/rot.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/utils.hpp
 create mode 100644 LAPACK_CPP/src/lapack_c/lartg_c.f90
 create mode 100644 LAPACK_CPP/src/lapack_c/lasr3_c.cpp
 create mode 100644 LAPACK_CPP/src/lapack_c/rot_c.f90
 create mode 100644 LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp
 create mode 100644 LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp

diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile
new file mode 100644
index 000000000..6d0d8eb0c
--- /dev/null
+++ b/LAPACK_CPP/Makefile
@@ -0,0 +1,43 @@
+CXX=g++
+# CXXFLAGS=-std=c++17 -Wall -Wextra -pedantic -g -Og
+CXXFLAGS=-std=c++17 -O3 -march=native -ffast-math -g -DNDEBUG
+
+FC=gfortran
+FFLAGS=-Wall -Wextra -pedantic -g
+
+all: ./example/test_lasr3 ./example/profile_lasr3
+
+HEADERS = $(wildcard lapack_c/include/*.h) \
+		  $(wildcard lapack_c/include/*/*.h) \
+		  $(wildcard lapack_c/include/*/*/*.h) \
+		  $(wildcard lapack_cpp/include/*.hpp) \
+		  $(wildcard lapack_cpp/include/*/*.hpp) \
+		  $(wildcard lapack_cpp/include/*/*/*.hpp)
+
+OBJFILES = src/lapack_c/rot_c.o \
+		   src/lapack_c/lartg_c.o \
+		   src/lapack_c/lasr3_c.o \
+		   src/lapack_cpp/rot_cpp.o \
+		   src/lapack_cpp/lartg_cpp.o \
+
+# C files
+
+src/lapack_c/%.o: src/lapack_c/%.cpp
+	$(CXX) $(CXXFLAGS) -I./include -c -o $@ $<
+
+src/lapack_c/%.o: src/lapack_c/%.f90
+	$(FC) $(FFLAGS) -c -o $@ $<
+
+# C++ files
+src/lapack_cpp/%.o: src/lapack_cpp/%.cpp $(HEADERS)
+	$(CXX) $(CXXFLAGS) -I./include -c -o $@ $<
+
+# Test files
+example/test_lasr3: example/test_lasr3.cpp $(OBJFILES) $(HEADERS)
+	$(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack
+
+example/profile_lasr3: example/profile_lasr3.cpp $(OBJFILES) $(HEADERS)
+	$(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack
+
+clean:
+	find . -type f -name '*.o' -delete
\ No newline at end of file
diff --git a/LAPACK_CPP/example/.gitignore b/LAPACK_CPP/example/.gitignore
new file mode 100644
index 000000000..ed20f170e
--- /dev/null
+++ b/LAPACK_CPP/example/.gitignore
@@ -0,0 +1,2 @@
+test_lasr3
+profile_lasr3
\ No newline at end of file
diff --git a/LAPACK_CPP/example/profile_lasr3.cpp b/LAPACK_CPP/example/profile_lasr3.cpp
new file mode 100644
index 000000000..456af8df8
--- /dev/null
+++ b/LAPACK_CPP/example/profile_lasr3.cpp
@@ -0,0 +1,99 @@
+
+#include <chrono>  // for high_resolution_clock
+#include <complex>
+#include <iostream>
+
+#include "lapack_cpp.hpp"
+
+using namespace lapack_cpp;
+
+template <typename T,
+          Layout layout = Layout::ColMajor,
+          typename idx_t = lapack_idx_t>
+void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction)
+{
+    idx_t n_rot = side == Side::Left ? m - 1 : n - 1;
+
+    MemoryBlock<T, idx_t> A_(m, n, layout);
+    Matrix<T, layout, idx_t> A(m, n, A_);
+
+    MemoryBlock<real_t<T>, idx_t> C_(n_rot, k, layout);
+    Matrix<real_t<T>, layout, idx_t> C(n_rot, k, C_);
+
+    MemoryBlock<T, idx_t> S_(n_rot, k, layout);
+    Matrix<T, layout, idx_t> S(n_rot, k, S_);
+
+    randomize(A);
+
+    // Generate random, but valid rotations
+    randomize(C);
+    randomize(S);
+    for (idx_t i = 0; i < n_rot; ++i) {
+        for (idx_t j = 0; j < k; ++j) {
+            T f = C(i, j);
+            T g = S(i, j);
+            T r;
+            lartg(f, g, C(i, j), S(i, j), r);
+        }
+    }
+
+    MemoryBlock<T, idx_t> A_copy_(m, n, layout);
+    Matrix<T, layout, idx_t> A_copy(m, n, A_copy_);
+
+    A_copy = A;
+
+    MemoryBlock<T, idx_t> work(
+        lasr3_workquery(side, direction, C.as_const(), S.as_const(), A));
+
+    const idx_t n_timings = 100;
+    const idx_t n_warmup = 20;
+    std::vector<float> timings(n_timings);
+
+    for (idx_t i = 0; i < n_timings; ++i) {
+        A = A_copy;
+        auto start = std::chrono::high_resolution_clock::now();
+        lasr3(side, direction, C.as_const(), S.as_const(), A, work);
+        auto end = std::chrono::high_resolution_clock::now();
+        timings[i] = std::chrono::duration<float>(end - start).count();
+    }
+
+    float mean = 0;
+    for (idx_t i = n_warmup; i < n_timings; ++i)
+        mean += timings[i];
+    mean /= n_timings - n_warmup;
+
+    long nflops =
+        side == Side::Left ? 6. * (m - 1) * n * k : 6. * (n - 1) * m * k;
+
+    std::cout << "m = " << m << ", n = " << n << ", k = " << k
+              << ", side = " << (char)side
+              << ", direction = " << (char)direction << ", mean time = " << mean
+              << " s"
+              << ", flop rate = " << nflops / mean * 1.0e-9 << " GFlops"
+              << std::endl;
+}
+
+int main()
+{
+    typedef lapack_idx_t idx_t;
+    typedef double T;
+
+    const idx_t nb = 1000;
+    const idx_t k = 64;
+
+    for (int s = 0; s < 2; ++s) {
+        Side side = s == 0 ? Side::Left : Side::Right;
+        for (int d = 0; d < 2; ++d) {
+            Direction direction =
+                d == 0 ? Direction::Forward : Direction::Backward;
+            for (idx_t n_rot = 99; n_rot <= 1600; n_rot += 100) {
+                idx_t m = side == Side::Left ? n_rot + 1 : nb;
+                idx_t n = side == Side::Left ? nb : n_rot + 1;
+
+                profile_lasr3<T, Layout::ColMajor, lapack_idx_t>(m, n, k, side,
+                                                                 direction);
+            }
+        }
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/LAPACK_CPP/example/test_lasr3.cpp b/LAPACK_CPP/example/test_lasr3.cpp
new file mode 100644
index 000000000..98f5d29a5
--- /dev/null
+++ b/LAPACK_CPP/example/test_lasr3.cpp
@@ -0,0 +1,115 @@
+
+#include <complex>
+#include <iostream>
+
+#include "lapack_cpp.hpp"
+
+using namespace lapack_cpp;
+
+template <typename T,
+          Layout layout = Layout::ColMajor,
+          typename idx_t = lapack_idx_t>
+void test_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction)
+{
+    idx_t n_rot = side == Side::Left ? m - 1 : n - 1;
+
+    MemoryBlock<T, idx_t> A_(m, n, layout);
+    Matrix<T, layout, idx_t> A(m, n, A_);
+
+    MemoryBlock<real_t<T>, idx_t> C_(n_rot, k, layout);
+    Matrix<real_t<T>, layout, idx_t> C(n_rot, k, C_);
+
+    MemoryBlock<T, idx_t> S_(n_rot, k, layout);
+    Matrix<T, layout, idx_t> S(n_rot, k, S_);
+
+    randomize(A);
+
+    // Generate random, but valid rotations
+    randomize(C);
+    randomize(S);
+    for (idx_t i = 0; i < n_rot; ++i) {
+        for (idx_t j = 0; j < k; ++j) {
+            T f = C(i, j);
+            T g = S(i, j);
+            T r;
+            lartg(f, g, C(i, j), S(i, j), r);
+        }
+    }
+
+    MemoryBlock<T, idx_t> A_copy_(m, n, layout);
+    Matrix<T, layout, idx_t> A_copy(m, n, A_copy_);
+
+    A_copy = A;
+
+    // Apply rotations using simple loops as test
+    if (side == Side::Left) {
+        if (direction == Direction::Forward) {
+            for (idx_t i = 0; i < k; ++i)
+                for (idx_t j = 0; j < n_rot; ++j)
+                    rot(A_copy.row(j), A_copy.row(j + 1), C(j, i), S(j, i));
+        }
+        else {
+            for (idx_t i = 0; i < k; ++i)
+                for (idx_t j = n_rot - 1; j >= 0; --j)
+                    rot(A_copy.row(j), A_copy.row(j + 1), C(j, i), S(j, i));
+        }
+    }
+    else {
+        if (direction == Direction::Forward) {
+            for (idx_t i = 0; i < k; ++i)
+                for (idx_t j = 0; j < n_rot; ++j)
+                    rot(A_copy.column(j), A_copy.column(j + 1), C(j, i),
+                        conj(S(j, i)));
+        }
+        else {
+            for (idx_t i = 0; i < k; ++i)
+                for (idx_t j = n_rot - 1; j >= 0; --j)
+                    rot(A_copy.column(j), A_copy.column(j + 1), C(j, i),
+                        conj(S(j, i)));
+        }
+    }
+
+    lasr3(side, direction, C.as_const(), S.as_const(), A);
+
+    // Check that the result is the same
+    for (idx_t i = 0; i < m; ++i)
+        for (idx_t j = 0; j < n; ++j)
+            A_copy(i, j) -= A(i, j);
+
+    real_t<T> err = 0.;
+    for (idx_t i = 0; i < m; ++i)
+        for (idx_t j = 0; j < n; ++j)
+            err = std::max(err, abs(A_copy(i, j)));
+
+    // This test should really be relative
+    if( err > 1e-5 ){
+        std::cout << "Failed test_lasr3 with parameters: " << m << ", " << n << ", " << k << ", " << (char)side << ", " << (char)direction << std::endl;
+        std::cout << "Error: " << err << std::endl;
+    } else {
+        std::cout << "Passed test_lasr3 with parameters: " << m << ", " << n << ", " << k << ", " << (char)side << ", " << (char)direction << std::endl;
+    }
+
+    // print(A_copy);
+}
+
+int main()
+{
+    typedef lapack_idx_t idx_t;
+    typedef std::complex<float> T;
+
+    for (idx_t nb = 64; nb <= 2000; nb *= 2) {
+        for (idx_t n_rot = 2; n_rot <= 2000; n_rot*=2) {
+            for (idx_t k = 32; k <= 128; k+=32) {
+                test_lasr3<T, Layout::ColMajor, lapack_idx_t>(
+                    n_rot + 1, nb, k, Side::Left, Direction::Forward);
+                test_lasr3<T, Layout::ColMajor, lapack_idx_t>(
+                    nb, n_rot + 1, k, Side::Right, Direction::Forward);
+                test_lasr3<T, Layout::ColMajor, lapack_idx_t>(
+                    n_rot + 1, nb, k, Side::Left, Direction::Backward);
+                test_lasr3<T, Layout::ColMajor, lapack_idx_t>(
+                    nb, n_rot + 1, k, Side::Right, Direction::Backward);
+            }
+        }
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h
new file mode 100644
index 000000000..4212a73f4
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c.h
@@ -0,0 +1,13 @@
+#ifndef LAPACK_C_HPP
+#define LAPACK_C_HPP
+
+#include "lapack_c/util.h"
+
+// BLAS
+#include "lapack_c/rot.h"
+
+// LAPACK
+#include "lapack_c/lartg.h"
+
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/lartg.h b/LAPACK_CPP/include/lapack_c/lartg.h
new file mode 100644
index 000000000..9b23aa60d
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/lartg.h
@@ -0,0 +1,22 @@
+#ifndef LAPACK_C_LARTG_HPP
+#define LAPACK_C_LARTG_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+void lapack_c_slartg(float f, float g, float* cs, float* sn, float* r);
+
+void lapack_c_dlartg(double f, double g, double* cs, double* sn, double* r);
+
+void lapack_c_clartg(lapack_float_complex f, lapack_float_complex g, float* cs, lapack_float_complex* sn, lapack_float_complex* r);
+
+void lapack_c_zlartg(lapack_double_complex f, lapack_double_complex g, double* cs, lapack_double_complex* sn, lapack_double_complex* r);
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/lasr3.h b/LAPACK_CPP/include/lapack_c/lasr3.h
new file mode 100644
index 000000000..efd86a916
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/lasr3.h
@@ -0,0 +1,94 @@
+#ifndef LAPACK_C_ROT_HPP
+#define LAPACK_C_ROT_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+void lapack_c_slasr3(char side,
+                     char direct,
+                     lapack_idx m,
+                     lapack_idx n,
+                     lapack_idx k,
+                     const float* C,
+                     lapack_idx ldc,
+                     const float* S,
+                     lapack_idx lds,
+                     float* A,
+                     lapack_idx lda,
+                     float* work,
+                     lapack_idx lwork);
+
+void lapack_c_dlasr3(char side,
+                     char direct,
+                     lapack_idx m,
+                     lapack_idx n,
+                     lapack_idx k,
+                     const double* C,
+                     lapack_idx ldc,
+                     const double* S,
+                     lapack_idx lds,
+                     double* A,
+                     lapack_idx lda,
+                     double* work,
+                     lapack_idx lwork);
+
+void lapack_c_clasr3(char side,
+                     char direct,
+                     lapack_idx m,
+                     lapack_idx n,
+                     lapack_idx k,
+                     const float* C,
+                     lapack_idx ldc,
+                     const lapack_float_complex* S,
+                     lapack_idx lds,
+                     lapack_float_complex* A,
+                     lapack_idx lda,
+                     lapack_float_complex* work,
+                     lapack_idx lwork);
+
+void lapack_c_sclasr3(char side,
+                      char direct,
+                      lapack_idx m,
+                      lapack_idx n,
+                      lapack_idx k,
+                      const float* C,
+                      lapack_idx ldc,
+                      const float* S,
+                      lapack_idx lds,
+                      lapack_float_complex* A,
+                      lapack_idx lda,
+                      lapack_float_complex* work,
+                      lapack_idx lwork);
+
+void lapack_c_zlasr3(lapack_idx m,
+                     lapack_idx n,
+                     lapack_idx k,
+                     const double* C,
+                     lapack_idx ldc,
+                     const lapack_double_complex* S,
+                     lapack_idx lds,
+                     lapack_double_complex* A,
+                     lapack_idx lda,
+                     lapack_double_complex* work,
+                     lapack_idx lwork);
+
+void lapack_c_dzlasr3(lapack_idx m,
+                      lapack_idx n,
+                      lapack_idx k,
+                      const double* C,
+                      lapack_idx ldc,
+                      const double* S,
+                      lapack_idx lds,
+                      lapack_double_complex* A,
+                      lapack_idx lda,
+                      lapack_double_complex* work,
+                      lapack_idx lwork);
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/rot.h b/LAPACK_CPP/include/lapack_c/rot.h
new file mode 100644
index 000000000..0c180a76a
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/rot.h
@@ -0,0 +1,46 @@
+#ifndef LAPACK_C_ROT_HPP
+#define LAPACK_C_ROT_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+void lapack_c_drot(lapack_idx n,
+                    double* x,
+                    lapack_idx incx,
+                    double* y,
+                    lapack_idx incy,
+                    double c,
+                    double s);
+
+void lapack_c_srot(lapack_idx n,
+                    float* x,
+                    lapack_idx incx,
+                    float* y,
+                    lapack_idx incy,
+                    float c,
+                    float s);
+
+void lapack_c_crot(lapack_idx n,
+                    lapack_float_complex* x,
+                    lapack_idx incx,
+                    lapack_float_complex* y,
+                    lapack_idx incy,
+                    float c,
+                    lapack_float_complex s);
+
+void lapack_c_zrot(lapack_idx n,
+                    lapack_double_complex* x,
+                    lapack_idx incx,
+                    lapack_double_complex* y,
+                    lapack_idx incy,
+                    double c,
+                    lapack_double_complex s);
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/util.h b/LAPACK_CPP/include/lapack_c/util.h
new file mode 100644
index 000000000..9a874ae37
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/util.h
@@ -0,0 +1,19 @@
+#ifndef LAPACK_C_UTIL_H
+#define LAPACK_C_UTIL_H
+
+// Define types for complex number
+#ifdef __cplusplus
+#include <complex>
+#define lapack_float_complex std::complex<float>
+#define lapack_double_complex std::complex<double>
+#else
+#define lapack_float_complex float _Complex
+#define lapack_double_complex double _Complex
+#endif // __cplusplus
+
+// Default to 32 bit integer if not yet defined
+#ifndef lapack_idx
+#define lapack_idx int
+#endif // lapack_idx
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp.hpp b/LAPACK_CPP/include/lapack_cpp.hpp
new file mode 100644
index 000000000..d54c2b9d8
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp.hpp
@@ -0,0 +1,9 @@
+#ifndef LAPACK_CPP_HPP
+#define LAPACK_CPP_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/blas.hpp"
+#include "lapack_cpp/lapack.hpp"
+#include "lapack_cpp/utils.hpp"
+
+#endif // LAPACK_CPP_HPP
diff --git a/LAPACK_CPP/include/lapack_cpp/base.hpp b/LAPACK_CPP/include/lapack_cpp/base.hpp
new file mode 100644
index 000000000..ec7dd5e49
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/base.hpp
@@ -0,0 +1,10 @@
+#ifndef LAPACK_CPP_BASE_HPP
+#define LAPACK_CPP_BASE_HPP
+
+#include "lapack_cpp/base/types.hpp"
+#include "lapack_cpp/base/enums.hpp"
+#include "lapack_cpp/base/matrix.hpp"
+#include "lapack_cpp/base/memory_block.hpp"
+#include "lapack_cpp/base/vector.hpp"
+
+#endif // LAPACK_CPP_BASE_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
new file mode 100644
index 000000000..d3f929aca
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
@@ -0,0 +1,84 @@
+#ifndef LAPACK_CPP_ENUMP_HPP
+#define LAPACK_CPP_ENUMP_HPP
+
+#include <cassert>
+
+namespace lapack_cpp {
+
+enum class Layout : char { RowMajor = 'R', ColMajor = 'C' };
+
+enum class Op : char { NoTrans = 'N', Trans = 'T', ConjTrans = 'C' };
+
+inline constexpr Op char2op(char t)
+{
+    switch (t) {
+        case 'N':
+            return Op::NoTrans;
+        case 'T':
+            return Op::Trans;
+        case 'C':
+            return Op::ConjTrans;
+        default:
+            assert(false);
+    }
+}
+
+enum class Side : char { Left = 'L', Right = 'R' };
+
+inline constexpr Side char2side(char t)
+{
+    switch (t) {
+        case 'L':
+            return Side::Left;
+        case 'R':
+            return Side::Right;
+        default:
+            assert(false);
+    }
+}
+
+enum class Uplo : char { Upper = 'U', Lower = 'L' };
+
+inline constexpr Uplo char2uplo(char t)
+{
+    switch (t) {
+        case 'U':
+            return Uplo::Upper;
+        case 'R':
+            return Uplo::Lower;
+        default:
+            assert(false);
+    }
+}
+
+enum class Diag : char { Unit = 'U', NonUnit = 'N' };
+
+inline constexpr Diag char2diag(char t)
+{
+    switch (t) {
+        case 'U':
+            return Diag::Unit;
+        case 'N':
+            return Diag::NonUnit;
+        default:
+            assert(false);
+    }
+}
+
+enum class Direction : char { Forward = 'F', Backward = 'B' };
+
+inline constexpr Direction char2direct(char t)
+{
+    switch (t) {
+        case 'F':
+            return Direction::Forward;
+        case 'N':
+            return Direction::Backward;
+        default:
+            assert(false);
+    }
+}
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_ENUMP_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp
new file mode 100644
index 000000000..ad10aae8c
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp
@@ -0,0 +1,334 @@
+
+#ifndef LAPACK_CPP_MATRIX_HPP
+#define LAPACK_CPP_MATRIX_HPP
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <random>
+
+#include "lapack_cpp/base/types.hpp"
+#include "lapack_cpp/base/enums.hpp"
+#include "lapack_cpp/base/memory_block.hpp"
+#include "lapack_cpp/base/vector.hpp"
+
+namespace lapack_cpp {
+
+// Forward declaration of ConstMatrix
+template <typename T, Layout layout = Layout::ColMajor, typename idx_t = lapack_idx_t>
+class ConstMatrix;
+
+/**
+ * A matrix class that can be used to store matrices of arbitrary size.
+ *
+ * The elements of the matrix are stored in column-major order
+ *
+ * @tparam T this is a template parameter that specifies the type of the
+ *           elements of the vector.
+ */
+template <typename T, Layout layout = Layout::ColMajor, typename idx_t = lapack_idx_t>
+class Matrix {
+   public:
+    typedef T value_type;
+    typedef idx_t index_type;
+    // Constructor for a matrix of size m x n
+    template <bool aligned>
+    Matrix(const idx_t m,
+           const idx_t n,
+           const MemoryBlock<T, idx_t, aligned>& m_block)
+        : m_(m),
+          n_(n),
+          ldim_(calc_ld<T, idx_t, aligned>(layout == Layout::ColMajor ? m_ : n_)),
+          data_(m_block.ptr())
+    {
+        assert(m_block.size() >=
+               ldim_ * (layout == Layout::ColMajor ? n_ : m_));
+    }
+
+    // Constructor for a matrix of size m x n
+    template <bool aligned>
+    Matrix(const idx_t m,
+           const idx_t n,
+           const MemoryBlock<T, idx_t, aligned>& m_block,
+           const idx_t ldim)
+        : m_(m), n_(n), ldim_(ldim), data_(m_block.ptr())
+    {
+        assert(m_block.size() >=
+               ldim_ * (layout == Layout::ColMajor ? n_ : m_));
+    }
+
+    // Constructor for a matrix of size m x n
+    Matrix(const idx_t m, const idx_t n, T* ptr, const idx_t ldim)
+        : m_(m), n_(n), ldim_(ldim), data_(ptr)
+    {
+        assert(ldim >= (layout == Layout::ColMajor ? m : n));
+    }
+
+    // Copy constructor
+    Matrix(const Matrix& m)
+        : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr())
+    {}
+
+    // Make a const promotion of the matrix
+    ConstMatrix<T, layout, idx_t> as_const() const
+    {
+        return ConstMatrix<T, layout, idx_t>(m_, n_, data_, ldim_);
+    }
+
+    // Assignment operator
+    void operator=(const Matrix& m)
+    {
+        assert(m.num_rows() == m_);
+        assert(m.num_columns() == n_);
+        for (idx_t i = 0; i < m_; ++i) {
+            for (idx_t j = 0; j < n_; ++j) {
+                (*this)(i,j) = m(i, j);
+            }
+        }
+    }
+
+    // Returns the number of rows of the matrix
+    inline idx_t num_rows() const { return m_; }
+
+    // Returns the number of columns of the matrix
+    inline idx_t num_columns() const { return n_; }
+
+    // Returns the leading dimension of the matrix
+    inline idx_t ldim() const { return ldim_; }
+
+    // Returns the size of the matrix (i.e. the number of elements m * n)
+    inline idx_t size() const { return n_ * m_; }
+
+    // Returns the ij-th element of the matrix
+    inline T& operator()(idx_t i, idx_t j) const
+    {
+        assert(i < m_);
+        assert(j < n_);
+        if (layout == Layout::ColMajor)
+            return data_[i + j * ldim_];
+        else
+            return data_[i * ldim_ + j];
+    }
+
+    /**
+     * Return a submatrix of the matrix.
+     *
+     * @param i1 index of the first row of the submatrix
+     * @param i2 index of the last row of the submatrix (not inclusive)
+     * @param j1 index of the first column of the submatrix
+     * @param j2 index of the last column of the submatrix (not inclusive)
+     *
+     * @example
+     * Matrix<float> A(5,5);
+     * auto B = A.submatrix(1, 4, 1, 4); // B is a 3x3 submatrix of A
+     */
+    inline Matrix submatrix(idx_t i1, idx_t i2, idx_t j1, idx_t j2) const
+    {
+        assert(i1 >= 0);
+        assert(i2 <= m_);
+        assert(j1 >= 0);
+        assert(j2 <= n_);
+        assert(i1 < i2);
+        assert(j1 < j2);
+        if (layout == Layout::ColMajor)
+            return Matrix(i2 - i1, j2 - j1, &data_[i1 + j1 * ldim_], ldim_);
+        else
+            return Matrix(i2 - i1, j2 - j1, &data_[i1 * ldim_ + j1], ldim_);
+    }
+
+    /**
+     * Return a column of the matrix as a vector
+     *
+     * @param i index of the column
+     */
+    inline Vector<T, idx_t> column(idx_t i) const
+    {
+        assert(i >= 0);
+        assert(i < n_);
+        if (layout == Layout::ColMajor)
+            return Vector<T, idx_t>(m_, &data_[i * ldim_], 1);
+        else
+            return Vector<T, idx_t>(m_, &data_[i], ldim_);
+    }
+
+    /**
+     * Return a row of the matrix as a vector
+     *
+     * @param i index of the row
+     */
+    inline Vector<T, idx_t> row(idx_t i) const
+    {
+        assert(i >= 0);
+        assert(i < m_);
+        if (layout == Layout::ColMajor)
+            return Vector<T, idx_t>(n_, &data_[i], ldim_);
+        else
+            return Vector<T, idx_t>(n_, &data_[i * ldim_], 1);
+    }
+
+    /**
+     * Return a pointer to the data of the matrix.
+     */
+    inline T* ptr() const { return data_; }
+
+   private:
+    // Number of rows of the matrix
+    const idx_t m_;
+    // Number of columns of the matrix
+    const idx_t n_;
+    // Leading dimension of the matrix
+    const idx_t ldim_;
+    // Pointer to the data
+    T* data_;
+};
+
+template <typename T, Layout layout, typename idx_t >
+class ConstMatrix {
+   public:
+    typedef T value_type;
+    typedef idx_t index_type;
+    // Constructor for a matrix of size m x n
+    template <bool aligned>
+    ConstMatrix(const idx_t m,
+                const idx_t n,
+                const MemoryBlock<T, idx_t, aligned>& m_block)
+        : m_(m),
+          n_(n),
+          ldim_(calc_ld<T, idx_t, aligned>(layout == Layout::ColMajor ? m_ : n_)),
+          data_(m_block.ptr())
+    {
+        assert(m_block.size() >=
+               ldim_ * (layout == Layout::ColMajor ? n_ : m_));
+    }
+
+    // Constructor for a matrix of size m x n
+    template <bool aligned>
+    ConstMatrix(const idx_t m,
+                const idx_t n,
+                const MemoryBlock<T, idx_t, aligned>& m_block,
+                const idx_t ldim)
+        : m_(m), n_(n), ldim_(ldim), data_(m_block.ptr())
+    {
+        assert(m_block.size() >=
+               ldim_ * (layout == Layout::ColMajor ? n_ : m_));
+    }
+
+    // Constructor for a matrix of size m x n
+    ConstMatrix(const idx_t m, const idx_t n, const T* ptr, const idx_t ldim)
+        : m_(m), n_(n), ldim_(ldim), data_(ptr)
+    {
+        assert(ldim >= (layout == Layout::ColMajor ? m : n));
+    }
+
+    // Copy constructor
+    ConstMatrix(const ConstMatrix& m)
+        : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr())
+    {}
+
+    // Const promotion constructor
+    ConstMatrix(const Matrix<T, layout, idx_t>& m)
+        : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr())
+    {}
+
+    // Returns the number of rows of the matrix
+    inline idx_t num_rows() const { return m_; }
+
+    // Returns the number of columns of the matrix
+    inline idx_t num_columns() const { return n_; }
+
+    // Returns the leading dimension of the matrix
+    inline idx_t ldim() const { return ldim_; }
+
+    // Returns the size of the matrix (i.e. the number of elements m * n)
+    inline idx_t size() const { return n_ * m_; }
+
+    // Returns the ij-th element of the matrix
+    inline T operator()(idx_t i, idx_t j) const
+    {
+        assert(i < m_);
+        assert(j < n_);
+        if (layout == Layout::ColMajor)
+            return data_[i + j * ldim_];
+        else
+            return data_[i * ldim_ + j];
+    }
+
+    /**
+     * Return a submatrix of the matrix.
+     *
+     * @param i1 index of the first row of the submatrix
+     * @param i2 index of the last row of the submatrix (not inclusive)
+     * @param j1 index of the first column of the submatrix
+     * @param j2 index of the last column of the submatrix (not inclusive)
+     *
+     * @example
+     * Matrix<float> A(5,5);
+     * auto B = A.submatrix(1, 4, 1, 4); // B is a 3x3 submatrix of A
+     */
+    inline ConstMatrix submatrix(idx_t i1, idx_t i2, idx_t j1, idx_t j2) const
+    {
+        assert(i1 >= 0);
+        assert(i2 <= m_);
+        assert(j1 >= 0);
+        assert(j2 <= n_);
+        assert(i1 < i2);
+        assert(j1 < j2);
+        if (layout == Layout::ColMajor)
+            return ConstMatrix(i2 - i1, j2 - j1, &data_[i1 + j1 * ldim_],
+                               ldim_);
+        else
+            return ConstMatrix(i2 - i1, j2 - j1, &data_[i1 * ldim_ + j1],
+                               ldim_);
+    }
+
+    /**
+     * Return a column of matrix as a vector
+     *
+     * @param i index of the column
+     */
+    inline ConstVector<T, idx_t> column(idx_t i) const
+    {
+        assert(i >= 0);
+        assert(i < n_);
+        if (layout == Layout::ColMajor)
+            return ConstVector<T, idx_t>(m_, &data_[i * ldim_], 1);
+        else
+            return ConstVector<T, idx_t>(m_, &data_[i], ldim_);
+    }
+
+    /**
+     * Return a row of matrix as a vector
+     *
+     * @param i index of the row
+     */
+    inline ConstVector<T, idx_t> row(idx_t i) const
+    {
+        assert(i >= 0);
+        assert(i < m_);
+        if (layout == Layout::ColMajor)
+            return ConstVector<T, idx_t>(n_, &data_[i], ldim_);
+        else
+            return ConstVector<T, idx_t>(n_, &data_[i * ldim_], 1);
+    }
+
+    /**
+     * Return a pointer to the data of the matrix.
+     */
+    inline const T* ptr() const { return data_; }
+
+   private:
+    // Number of rows of the matrix
+    const idx_t m_;
+    // Number of columns of the matrix
+    const idx_t n_;
+    // Leading dimension of the matrix
+    const idx_t ldim_;
+    // Pointer to the data
+    const T* data_;
+};
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_MATRIX_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp b/LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp
new file mode 100644
index 000000000..c2ccb90a2
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp
@@ -0,0 +1,136 @@
+
+#ifndef LAPACK_CPP_MEMORY_BLOCK_HPP
+#define LAPACK_CPP_MEMORY_BLOCK_HPP
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <type_traits>
+
+#include "lapack_cpp/base/types.hpp"
+
+namespace lapack_cpp {
+
+/**
+ * Calculate the leading dimension of a matrix, given the number of rows.
+ * Normally, the leading dimension can just be equal to to the number of rows,
+ * but for performance reasons, we want to make sure that the leading dimension
+ * is a multiple of 64 bytes. That way, if the first column is aligned, they all
+ * are.
+ *
+ * @tparam T
+ * @param m
+ * @return int
+ */
+template <typename T, typename idx_t, bool aligned = true>
+inline constexpr idx_t calc_ld(const idx_t m)
+{
+    if (aligned)
+        return ((m - 1 + (64 / sizeof(T))) / (64 / sizeof(T))) *
+               (64 / sizeof(T));
+    else
+        return m;
+}
+
+/**
+ * A class representing a contiguous block of memory of fixed size.
+ *
+ * This class owns the data. It is responsible for allocating and deallocating
+ * the memory.
+ *
+ * A special note about const. Declaring a Memory block as const does not make
+ * the data const. For example:
+ * const MemoryBlock<double> A(10);
+ * double* data = A.ptr();
+ * data[0] = 1.0; // This is allowed
+ */
+template <typename T, typename idx_t = lapack_idx_t, bool aligned = true>
+class MemoryBlock {
+   public:
+    // Constructor for a block of size n
+    MemoryBlock(idx_t n)
+        : n_(n),
+          data_(aligned ? (T*)aligned_alloc(64, n_ * sizeof(T))
+                        : (T*)malloc(n_ * sizeof(T)))
+    {
+        assert(n >= 0);
+        #ifndef NDEBUG
+            // When debugging, fill the memory with NaNs
+            // This makes it easier to spot uninitialized memory
+            std::fill(data_, data_ + n_, NAN);
+        #endif
+    }
+
+    // Constructor for a block of sufficient size to store a matrix of size m x
+    // n
+    MemoryBlock(idx_t m, idx_t n, Layout layout = Layout::ColMajor)
+        : n_(layout == Layout::ColMajor ? (calc_ld<T, idx_t, aligned>(m) * n) : (calc_ld<T, idx_t, aligned>(n) * m)),
+          data_(aligned ? (T*)aligned_alloc(64, n_ * sizeof(T))
+                        : (T*)malloc(n_ * sizeof(T)))
+    {
+        assert(m >= 0);
+        assert(n >= 0);
+        #ifndef NDEBUG
+            // When debugging, fill the memory with NaNs
+            // This makes it easier to spot uninitialized memory
+            std::fill(data_, data_ + n_, NAN);
+        #endif
+    }
+
+    // Non-owning constructor
+    MemoryBlock(idx_t n, T* data) : n_(n), data_(data), owns_data_(false)
+    {
+        assert(n >= 0);
+    }
+
+    // Destructor
+    ~MemoryBlock()
+    {
+        if (owns_data_) free((std::remove_const_t<T>*)data_);
+    }
+
+    // Copy constructor
+    MemoryBlock(const MemoryBlock& other)
+        : n_(other.n_), data_(other.data_), owns_data_(false)
+    {}
+
+    // Assignment operator
+    void operator=(const MemoryBlock& other)
+    {
+        assert(other.size() == this->size());
+        std::copy(other.data_, other.data_ + n_, data_);
+    }
+
+    // Memory block that remains after creating a vector of size n
+    MemoryBlock remainder(idx_t n) const
+    {
+        idx_t size = calc_ld<T, idx_t, aligned>(n);
+        assert(size <= n_);
+        return MemoryBlock(n_ - n, data_ + n);
+    }
+
+    // Memory block that remains after creating a vector of size n
+    MemoryBlock remainder(idx_t m, idx_t n) const
+    {
+        idx_t size = calc_ld<T, idx_t, aligned>(calc_ld<T, idx_t, aligned>(m) * n);
+        assert(size <= n_);
+        return MemoryBlock(n_ - size, data_ + size);
+    }
+
+    // Returns the size of the memory block
+    inline idx_t size() const { return n_; }
+
+    /**
+     * Return a pointer to the data of the vector.
+     */
+    inline T* ptr() const { return data_; }
+
+   private:
+    const idx_t n_;
+    T* data_;
+    const bool owns_data_ = true;
+};
+
+}  // namespace lapack_cpp
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/types.hpp b/LAPACK_CPP/include/lapack_cpp/base/types.hpp
new file mode 100644
index 000000000..2ce546bae
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/base/types.hpp
@@ -0,0 +1,33 @@
+#ifndef LAPACK_CPP_BASE_TYPES_HPP
+#define LAPACK_CPP_BASE_TYPES_HPP
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+
+// While the library is templated and can handle different integers,
+// this is the default integer type used by the library.
+typedef int64_t lapack_idx_t;
+
+// declare conj for real types
+template <typename T,
+          std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
+inline T conj(const T& x)
+{
+    return x;
+}
+
+template <typename T>
+struct real_type_struct {
+    typedef T type;
+};
+
+template <typename T>
+struct real_type_struct<std::complex<T>> {
+    typedef T type;
+};
+
+template <typename T>
+using real_t = typename real_type_struct<T>::type;
+
+#endif  // LAPACK_CPP_BASE_TYPES_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/vector.hpp b/LAPACK_CPP/include/lapack_cpp/base/vector.hpp
new file mode 100644
index 000000000..d0d723afd
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/base/vector.hpp
@@ -0,0 +1,203 @@
+
+#ifndef LAPACK_CPP_VECTOR_HPP
+#define LAPACK_CPP_VECTOR_HPP
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+
+#include "lapack_cpp/base/types.hpp"
+#include "lapack_cpp/base/enums.hpp"
+#include "lapack_cpp/base/memory_block.hpp"
+
+namespace lapack_cpp {
+
+// Forward declaration of ConstVector
+template <typename T, typename idx_t = lapack_idx_t>
+class ConstVector;
+
+/**
+ * A vector class that can be used to store vectors of arbitrary size.
+ *
+ * @tparam T this is a template parameter that specifies the type of the
+ *           elements of the vector.
+ */
+template <typename T, typename idx_t = lapack_idx_t>
+class Vector {
+   public:
+    typedef T value_type;
+    typedef idx_t index_type;
+
+    // Constructor for a vector of size n
+    template <bool aligned>
+    Vector(idx_t n,
+           const MemoryBlock<T, idx_t, aligned>& m_block,
+           idx_t stride = 1)
+        : n_(n), stride_(stride), data_(m_block.ptr())
+    {
+        assert(n >= 0);
+        assert(m_block.size() >= n);
+    }
+
+    // Constructor for a vector of size n
+    Vector(idx_t n, T* data, idx_t stride = 1)
+        : n_(n), stride_(stride), data_(data)
+    {
+        assert(n >= 0);
+    }
+
+    // Copy constructor
+    // Note: the default copy constructor would probably work, but we want to
+    // emphasize that we are doing a shallow copy here.
+    Vector(const Vector<T>& v) : n_(v.n_), stride_(v.stride_), data_(v.data_) {}
+
+    // Assignment operator
+    void operator=(const Vector<T>& v)
+    {
+        assert(v.size() == this->size());
+        for (int i = 0; i < this->size(); ++i) {
+            (*this)[i] = v[i];
+        }
+    }
+
+    // Make a const promotion of the vector
+    inline ConstVector<T, idx_t> as_const() const
+    {
+        return ConstVector<T, idx_t>(n_, data_, stride_);
+    }
+
+    // Returns the size of the vector
+    inline idx_t size() const { return n_; }
+
+    inline idx_t stride() const { return stride_; }
+
+    // Returns the i-th element of the vector
+    // Note: this function will only be called when the vector is not const
+    // therefore, we return the element by reference.
+    inline T& operator[](const idx_t i) const
+    {
+        assert(i < n_);
+        return data_[i * stride_];
+    }
+
+    /**
+     * The subvector starts at index start and ends at index end - 1.
+     * The stride is the step size between two elements of the subvector.
+     */
+    inline Vector subvector(idx_t start, idx_t end, idx_t stride = 1) const
+    {
+        assert(start >= 0);
+        assert(end <= n_);
+        assert(start < end);
+        assert(stride > 0);
+        return Vector((end - start) / stride, &data_[start * stride_],
+                      stride * stride_);
+    }
+
+    /**
+     * Return a pointer to the data of the vector.
+     */
+    inline T* ptr() const { return data_; }
+
+   private:
+    const idx_t n_;
+    const idx_t stride_;
+    T* data_;
+};
+
+/**
+ * A vector class that can be used to store vectors of arbitrary size.
+ *
+ * @tparam T this is a template parameter that specifies the type of the
+ *           elements of the vector.
+ */
+template <typename T, typename idx_t>
+class ConstVector {
+   public:
+    typedef T value_type;
+    typedef idx_t index_type;
+    // Constructor for a vector of size n
+    template <bool aligned>
+    ConstVector(idx_t n,
+                const MemoryBlock<T, idx_t, aligned>& m_block,
+                idx_t stride = 1)
+        : n_(n), stride_(stride), data_(m_block.ptr())
+    {
+        assert(n >= 0);
+        assert(m_block.size() >= n);
+    }
+
+    // Constructor for a vector of size n
+    ConstVector(idx_t n, const T* data, idx_t stride = 1)
+        : n_(n), stride_(stride), data_(data)
+    {
+        assert(n >= 0);
+    }
+
+    // Copy constructor
+    // Note: the default copy constructor would probably work, but we want to
+    // emphasize that we are doing a shallow copy here.
+    ConstVector(const ConstVector<T, idx_t>& v)
+        : n_(v.n_), stride_(v.stride_), data_(v.data_)
+    {}
+
+    // Const promotion constructor
+    ConstVector(const Vector<T, idx_t>& v)
+        : n_(v.size()), stride_(v.stride()), data_(v.ptr())
+    {}
+
+    // Assignment operator
+    void operator=(const Vector<T>& v)
+    {
+        assert(v.size() == this->size());
+        for (int i = 0; i < this->size(); ++i) {
+            (*this)[i] = v[i];
+        }
+    }
+
+    // Returns the size of the vector
+    inline idx_t size() const { return n_; }
+
+    inline idx_t stride() const { return stride_; }
+
+    // Returns the i-th element of the vector
+    // Note: this function will only be called when the vector is not const
+    // therefore, we return the element by reference.
+    inline T operator[](const idx_t i) const
+    {
+        assert(i < n_);
+        return data_[i * stride_];
+    }
+
+    /**
+     * The subvector starts at index start and ends at index end - 1.
+     * The stride is the step size between two elements of the subvector.
+     *
+     * Note: this function will also be called if the matrix is const, even
+     * though the returned object is not const. Solving this problem would
+     * require either an inefficient or a more complex design.
+     */
+    inline ConstVector subvector(idx_t start, idx_t end, idx_t stride = 1) const
+    {
+        assert(start >= 0);
+        assert(end <= n_);
+        assert(start < end);
+        assert(stride > 0);
+        return ConstVector((end - start) / stride, &data_[start * stride_],
+                           stride * stride_);
+    }
+
+    /**
+     * Return a pointer to the data of the vector.
+     */
+    inline const T* ptr() const { return data_; }
+
+   private:
+    const idx_t n_;
+    const idx_t stride_;
+    const T* data_;
+};
+
+}  // namespace lapack_cpp
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/blas.hpp b/LAPACK_CPP/include/lapack_cpp/blas.hpp
new file mode 100644
index 000000000..b2d996ee0
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/blas.hpp
@@ -0,0 +1,6 @@
+#ifndef LAPACK_CPP_BLAS_HPP
+#define LAPACK_CPP_BLAS_HPP
+
+#include "lapack_cpp/blas/rot.hpp"
+
+#endif // LAPACK_CPP_BLAS_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/blas/rot.hpp b/LAPACK_CPP/include/lapack_cpp/blas/rot.hpp
new file mode 100644
index 000000000..29db48f66
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/blas/rot.hpp
@@ -0,0 +1,24 @@
+#ifndef LAPACK_CPP_ROT_HPP
+#define LAPACK_CPP_ROT_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/utils.hpp"
+namespace lapack_cpp {
+
+/**
+ * Apply a plane rotation to a pair of vectors.
+ */
+template <typename T, typename TS, typename idx_t>
+void rot(const Vector<T, idx_t>& x, const Vector<T, idx_t>& y, real_t<T> c, TS s){
+    assert(x.size() == y.size());
+    const idx_t n = x.size();
+    for (idx_t i = 0; i < n; ++i) {
+        T temp = -conj(s) * x[i] + c * y[i];
+        x[i] = c * x[i] + s * y[i];
+        y[i] = temp;
+    }
+}
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_GEMV_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack.hpp b/LAPACK_CPP/include/lapack_cpp/lapack.hpp
new file mode 100644
index 000000000..31e03ce78
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack.hpp
@@ -0,0 +1,7 @@
+#ifndef LAPACK_CPP_LAPACK_HPP
+#define LAPACK_CPP_LAPACK_HPP
+
+#include "lapack_cpp/lapack/lartg.hpp"
+#include "lapack_cpp/lapack/lasr3.hpp"
+
+#endif // LAPACK_CPP_LAPACK_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp
new file mode 100644
index 000000000..5f5b345dd
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp
@@ -0,0 +1,16 @@
+#ifndef LAPACK_CPP_LARTG_HPP
+#define LAPACK_CPP_LARTG_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/utils.hpp"
+namespace lapack_cpp {
+
+/**
+ * Generate a plane rotation.
+ */
+template <typename T>
+void lartg(T f, T g, real_t<T>& c, T& s, T& r);
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_GEMV_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
new file mode 100644
index 000000000..0a3ffbb8a
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
@@ -0,0 +1,815 @@
+#ifndef LAPACK_CPP_LASR3_HPP
+#define LAPACK_CPP_LASR3_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/blas/rot.hpp"
+#include "lapack_cpp/utils.hpp"
+
+namespace lapack_cpp {
+
+/**
+ * Applies a sequence of plane rotations to a general rectangular matrix.
+ *
+ * @param side   Specifies whether the plane rotations are applied to A on the
+ *               left or right.
+ *
+ * @param direct Specifies whether the sequence is applied in forward or
+ *               backward order.
+ *
+ * @param C      The cosines of the rotations
+ *               If side == Side::Left, C is an array of dimension m-1 x k
+ *               If side == Side::Right, C is an array of dimension n-1 x k
+ *
+ * @param S      The sines of the rotations
+ *               If side == Side::Left, S is an array of dimension m-1 x k
+ *               If side == Side::Right, S is an array of dimension n-1 x k
+ *
+ * @param A      m x n matrix to which the rotations are to be applied.
+ *
+ * @param work   A workspace array, its dimension is specified by lasr3_work.
+ *               Note, because we use the work array to align the data for
+ *               efficient vectorization, this routine only support aligned
+ *               work arrays.
+ *
+ * @tparam T     The type of the elements of the matrix A.
+ *
+ * @tparam TC    The type of the elements of the matrix C.
+ *
+ * @tparam TS    The type of the elements of the matrix S.
+ *
+ * @tparam layout The layout of the matrices A, C, and S.
+ *
+ * @tparam idx_t  The type of the indices.
+ *
+ * @note if either TC or TS is a complex type, then T must also be a complex
+ * type.
+ *
+ */
+template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+void lasr3(Side side,
+           Direction direct,
+           ConstMatrix<TC, layout, idx_t> C,
+           ConstMatrix<TS, layout, idx_t> S,
+           Matrix<T, layout, idx_t> A,
+           MemoryBlock<T, idx_t, true> work);
+
+/**
+ * Workspace query for lasr3.
+ */
+template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+idx_t lasr3_workquery(Side side,
+                      Direction direct,
+                      ConstMatrix<TC, layout, idx_t> C,
+                      ConstMatrix<TS, layout, idx_t> S,
+                      Matrix<T, layout, idx_t> A)
+{
+    const idx_t m = A.num_rows();
+    const idx_t n = A.num_columns();
+    const idx_t k = C.num_columns();
+    const idx_t num_rot = C.num_rows();
+
+    assert(side == Side::Left || side == Side::Right);
+    assert(direct == Direction::Forward || direct == Direction::Backward);
+    assert(C.num_rows() == S.num_rows());
+    assert(C.num_columns() == S.num_columns());
+    assert(side == Side::Left ? m : n == C.num_rows() + 1);
+
+    // Blocksize for the number of rows/columns of A to be processed at once
+    const idx_t block_mn = 256;
+    // Blocksize for the number of rotation sequences to be processed at once
+    const idx_t block_k = 64;
+    const idx_t block_k2 = std::min(num_rot - 3, block_k);
+
+    const idx_t nm2 = std::min(block_mn, side == Side::Left ? n : m);
+    const idx_t k2 = std::min(block_k2, k);
+
+    return calc_ld<T, idx_t>(nm2) * (k2 + 2);
+}
+
+/**
+ * @copydoc lasr3
+ */
+template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+void lasr3(Side side,
+           Direction direct,
+           ConstMatrix<TC, layout, idx_t> C,
+           ConstMatrix<TS, layout, idx_t> S,
+           Matrix<T, layout, idx_t> A)
+{
+    idx_t lwork = lasr3_workquery(side, direct, C, S, A);
+    MemoryBlock<T, idx_t, true> work(lwork);
+    lasr3(side, direct, C, S, A, work);
+}
+
+namespace internal {
+
+    /** Applies two plane rotations to three vectors.
+     *
+     * [x1]   [1 0   0  ]   [c1  s1 0]   [x1]
+     * [x2] = [0 c2  s2 ] * [-s1 c1 0] * [x2]
+     * [x3]   [0 -s2 c2 ]   [0   0  1]   [x3]
+     *
+     * @param x1  The first vector.
+     * @param x2  The second vector.
+     * @param x3  The third vector.
+     * @param c1  The cosine of the first rotation.
+     * @param s1  The sine of the first rotation.
+     * @param c2  The cosine of the second rotation.
+     * @param s2  The sine of the second rotation.
+     */
+    template <typename T, typename TC, typename TS, typename idx_t>
+    void rot_fuse2x1(Vector<T, idx_t> x1,
+                     Vector<T, idx_t> x2,
+                     Vector<T, idx_t> x3,
+                     TC c1,
+                     TS s1,
+                     TC c2,
+                     TS s2)
+    {
+        assert(x1.size() == x2.size() && x2.size() == x3.size());
+        const idx_t n = x1.size();
+        for (idx_t i = 0; i < n; ++i) {
+            T x2_g1 = -conj(s1) * x1[i] + c1 * x2[i];
+            x1[i] = c1 * x1[i] + s1 * x2[i];
+            x2[i] = c2 * x2_g1 + s2 * x3[i];
+            x3[i] = -conj(s2) * x2_g1 + c2 * x3[i];
+        }
+    }
+
+    /** Applies two plane rotations to three vectors.
+     *
+     * [x1]   [c2  s2 0]   [1 0   0  ]   [x1]
+     * [x2] = [-s2 c2 0] * [0 c1  s1 ] * [x2]
+     * [x3]   [0   0  1]   [0 -s1 c1 ]   [x3]
+     *
+     * @param x1  The first vector.
+     * @param x2  The second vector.
+     * @param x3  The third vector.
+     * @param c1  The cosine of the first rotation.
+     * @param s1  The sine of the first rotation.
+     * @param c2  The cosine of the second rotation.
+     * @param s2  The sine of the second rotation.
+     */
+    template <typename T, typename TC, typename TS, typename idx_t>
+    void rot_fuse1x2(Vector<T, idx_t> x1,
+                     Vector<T, idx_t> x2,
+                     Vector<T, idx_t> x3,
+                     TC c1,
+                     TS s1,
+                     TC c2,
+                     TS s2)
+    {
+        assert(x1.size() == x2.size() && x2.size() == x3.size());
+        const idx_t n = x1.size();
+        for (idx_t i = 0; i < n; ++i) {
+            T x2_g1 = c1 * x2[i] + s1 * x3[i];
+            x3[i] = -conj(s1) * x2[i] + c1 * x3[i];
+            x2[i] = -conj(s2) * x1[i] + c2 * x2_g1;
+            x1[i] = c2 * x1[i] + s2 * x2_g1;
+        }
+    }
+
+    /** Applies four plane rotations to four vectors.
+     *
+     * [x1]   [1 0   0  0]   [1 0 0   0 ]   [c2  s2 0 0]   [1 0   0  0]   [x1]
+     * [x2] = [0 c4  s4 0] * [0 1 0   0 ] * [-s2 c2 0 0] * [0 c1  s1 0] * [x2]
+     * [x3]   [0 -s4 c4 0]   [0 0 c3  s3]   [0   0  1 0]   [0 -s1 c1 0]   [x3]
+     * [x4]   [0 0   0  1]   [0 0 -s3 c3]   [0   0  0 1]   [0 0   0  0]   [x4]
+     *
+     * Note: the rotations are applied in the order G1, G2, G3, G4,
+     * but the order G1, G3, G2, G4 is also possible.
+     *
+     * @param x1  The first vector.
+     * @param x2  The second vector.
+     * @param x3  The third vector.
+     * @param x4  The fourth vector.
+     * @param c1  The cosine of the first rotation.
+     * @param s1  The sine of the first rotation.
+     * @param c2  The cosine of the second rotation.
+     * @param s2  The sine of the second rotation.
+     * @param c3  The cosine of the third rotation.
+     * @param s3  The sine of the third rotation.
+     * @param c4  The cosine of the fourth rotation.
+     * @param s4  The sine of the fourth rotation.
+     */
+    template <typename T, typename TC, typename TS, typename idx_t>
+    void rot_fuse2x2(Vector<T, idx_t> x1,
+                     Vector<T, idx_t> x2,
+                     Vector<T, idx_t> x3,
+                     Vector<T, idx_t> x4,
+                     TC c1,
+                     TS s1,
+                     TC c2,
+                     TS s2,
+                     TC c3,
+                     TS s3,
+                     TC c4,
+                     TS s4)
+    {
+        assert(x1.size() == x2.size() && x1.size() == x3.size() &&
+               x1.size() == x4.size());
+        const idx_t n = x1.size();
+        for (idx_t i = 0; i < n; ++i) {
+            T x2_g1 = c1 * x2[i] + s1 * x3[i];
+            T x3_g1 = -conj(s1) * x2[i] + c1 * x3[i];
+            T x2_g2 = -conj(s2) * x1[i] + c2 * x2_g1;
+            x1[i] = c2 * x1[i] + s2 * x2_g1;
+            T x3_g3 = c3 * x3_g1 + s3 * x4[i];
+            x4[i] = -conj(s3) * x3_g1 + c3 * x4[i];
+            x2[i] = c4 * x2_g2 + s4 * x3_g3;
+            x3[i] = -conj(s4) * x2_g2 + c4 * x3_g3;
+        }
+    }
+
+    // Kernel for lasr3, forward left variant
+    template <typename T,
+              typename TC,
+              typename TS,
+              Layout layout,
+              typename idx_t>
+    void lasr3_kernel_forward_left(ConstMatrix<TC, layout, idx_t> C,
+                                   ConstMatrix<TS, layout, idx_t> S,
+                                   Matrix<T, layout, idx_t> A,
+                                   MemoryBlock<T, idx_t, true> work)
+    {
+        idx_t m = A.num_rows();
+        idx_t n = A.num_columns();
+        idx_t k = C.num_columns();
+
+        assert(C.num_rows() == S.num_rows());
+        assert(C.num_columns() == S.num_columns());
+        assert(m == C.num_rows() + 1);
+        assert(m > k + 1);
+
+        // Number of rows that will be stored in the packed workspace matrix
+        const idx_t np = k + 2;
+        // During the algorithm, instead of applying the rotations directly to
+        // the matrix A, we will apply them to a packed version of A. This is
+        // done to allow for efficient vectorization of the rotations. The
+        // packed matrix is stored in the workspace array. Since we apply the
+        // rotations from the left, this packed matrix is always row-major
+        // regardless of the layout of A.
+        Matrix<T, Layout::RowMajor, idx_t> A_pack(np, n, work);
+        // A_pack will store np rows of A.
+        // To avoid having to shuffle rows in A, we keep track of two indices
+        // For i = i_pack2, ..., i_pack2 + np - 1, row (i_pack + i - i_pack2 +
+        // np) % np of A_pack corresponds to row i of A.
+        idx_t i_pack = 0;
+        idx_t i_pack2 = 0;
+
+        // Copy the first np rows of A to A_pack
+        for (idx_t i = 0; i < np; ++i) {
+            for (idx_t j = 0; j < n; ++j) {
+                A_pack(i, j) = A(i, j);
+            }
+        }
+
+        // Startup phase
+        for (idx_t j = 0; j + 1 < k; ++j) {
+            for (idx_t i = 0, g = j; i < j + 1; ++i, --g) {
+                rot(A_pack.row(g), A_pack.row(g + 1), C(g, i), S(g, i));
+            }
+        }
+
+        // Pipeline phase
+        for (idx_t j = k - 1; j + 1 < m - 1; j += 2) {
+            for (idx_t i = 0, g = j; i + 1 < k; i += 2, g -= 2) {
+                auto a1 = A_pack.row((g - 1 + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.row((g + i_pack - i_pack2 + np) % np);
+                auto a3 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np);
+                auto a4 = A_pack.row((g + 2 + i_pack - i_pack2 + np) % np);
+
+                rot_fuse2x2(a1, a2, a3, a4, C(g, i), S(g, i), C(g - 1, i + 1),
+                            S(g - 1, i + 1), C(g + 1, i), S(g + 1, i),
+                            C(g, i + 1), S(g, i + 1));
+            }
+            if (k % 2 == 1) {
+                // k is odd, so there are two more rotations to apply
+                idx_t i = k - 1;
+                idx_t g = j - i;
+
+                auto a1 = A_pack.row((g + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np);
+                auto a3 = A_pack.row((g + 2 + i_pack - i_pack2 + np) % np);
+
+                rot_fuse2x1(a1, a2, a3, C(g, i), S(g, i), C(g + 1, i),
+                            S(g + 1, i));
+            }
+            // rows i_pack and i_pack+1 of A_pack are finished, copy them back
+            // to A
+            for (idx_t i = 0; i < n; i++) {
+                A(i_pack2, i) = A_pack(i_pack, i);
+                A(i_pack2 + 1, i) = A_pack((i_pack + 1) % np, i);
+            }
+            // Pack next rows and update i_pack and i_pack2
+            if (j + 4 < m) {
+                for (idx_t i = 0; i < n; i++) {
+                    A_pack(i_pack, i) = A(j + 3, i);
+                    A_pack((i_pack + 1) % np, i) = A(j + 4, i);
+                }
+                i_pack = (i_pack + 2) % np;
+                i_pack2 += 2;
+            }
+            else if (j + 3 < m) {
+                for (idx_t i = 0; i < n; i++) {
+                    A_pack(i_pack, i) = A(j + 3, i);
+                }
+                i_pack = (i_pack + 1) % np;
+                i_pack2 += 1;
+            }
+        }
+
+        // Shutdown phase
+        for (idx_t j = (m - k + 1) % 2; j < k; ++j) {
+            for (idx_t i = j, g = m - 2; i < k; ++i, --g) {
+                auto a1 = A_pack.row((g + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np);
+                rot(a1, a2, C(g, i), S(g, i));
+            }
+        }
+
+        // Copy the last np rows of A_pack back to A
+        for (idx_t i = 0; i < std::min(np, m - i_pack2); ++i) {
+            for (idx_t j = 0; j < n; ++j) {
+                A(i_pack2 + i, j) = A_pack((i_pack + i) % np, j);
+            }
+        }
+    }
+
+    // Kernel for lasr3, backward left variant
+    template <typename T,
+              typename TC,
+              typename TS,
+              Layout layout,
+              typename idx_t>
+    void lasr3_kernel_backward_left(ConstMatrix<TC, layout, idx_t> C,
+                                    ConstMatrix<TS, layout, idx_t> S,
+                                    Matrix<T, layout, idx_t> A,
+                                    MemoryBlock<T, idx_t, true> work)
+    {
+        idx_t m = A.num_rows();
+        idx_t n = A.num_columns();
+        idx_t k = C.num_columns();
+
+        assert(C.num_rows() == S.num_rows());
+        assert(C.num_columns() == S.num_columns());
+        assert(m == C.num_rows() + 1);
+        assert(m > k + 1);
+
+        // Number of rows that will be stored in the packed workspace matrix
+        const idx_t np = k + 2;
+        // During the algorithm, instead of applying the rotations directly to
+        // the matrix A, we will apply them to a packed version of A. This is
+        // done to allow for efficient vectorization of the rotations. The
+        // packed matrix is stored in the workspace array. Since we apply the
+        // rotations from the left, this packed matrix is always row-major
+        // regardless of the layout of A.
+        Matrix<T, Layout::RowMajor, idx_t> A_pack(np, n, work);
+        // A_pack will store np rows of A.
+        // To avoid having to shuffle rows in A, we keep track of two indices
+        // For i = i_pack2, ..., i_pack2 + np - 1, row (i_pack + i - i_pack2 +
+        // np) % np of A_pack corresponds to row i of A.
+        idx_t i_pack = 0;
+        idx_t i_pack2 = m - np;
+
+        // Copy the last np rows of A to A_pack
+        for (idx_t i = 0; i < np; ++i) {
+            for (idx_t j = 0; j < n; ++j) {
+                A_pack(i, j) = A(m - np + i, j);
+            }
+        }
+
+        // Startup phase
+        for (idx_t j = 0; j + 1 < k; ++j) {
+            for (idx_t i = 0, g = m - 2 - j; i < j + 1; ++i, ++g) {
+                rot(A_pack.row((g - i_pack2) % np),
+                    A_pack.row((g + 1 - i_pack2) % np), C(g, i), S(g, i));
+            }
+        }
+
+        // Pipeline phase
+        for (idx_t j = k - 1; j + 1 < m - 1; j += 2) {
+            for (idx_t i = 0, g = m - 2 - j; i + 1 < k; i += 2, g += 2) {
+                auto a1 = A_pack.row((g - 1 + i_pack - i_pack2) % np);
+                auto a2 = A_pack.row((g + i_pack - i_pack2) % np);
+                auto a3 = A_pack.row((g + 1 + i_pack - i_pack2) % np);
+                auto a4 = A_pack.row((g + 2 + i_pack - i_pack2) % np);
+
+                rot_fuse2x2(a1, a2, a3, a4, C(g, i), S(g, i), C(g - 1, i),
+                            S(g - 1, i), C(g + 1, i + 1), S(g + 1, i + 1),
+                            C(g, i + 1), S(g, i + 1));
+            }
+            if (k % 2 == 1) {
+                // k is odd, so there are two more rotations to apply
+                idx_t i = k - 1;
+                idx_t g = m - 2 - j + i;
+
+                auto a1 = A_pack.row((g - 1 + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.row((g + i_pack - i_pack2 + np) % np);
+                auto a3 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np);
+
+                rot_fuse1x2(a1, a2, a3, C(g, i), S(g, i), C(g - 1, i),
+                            S(g - 1, i));
+            }
+            // rows i_pack+np-2 and i_pack+np-1 are finished, copy them back to
+            // A
+            for (idx_t i = 0; i < n; i++) {
+                A(i_pack2 + np - 2, i) = A_pack((i_pack + np - 2) % np, i);
+                A(i_pack2 + np - 1, i) = A_pack((i_pack + np - 1) % np, i);
+            }
+            // Pack next rows and update i_pack and i_pack2
+            if (i_pack2 > 1) {
+                for (idx_t i = 0; i < n; i++) {
+                    A_pack((i_pack + np - 2) % np, i) = A(i_pack2 - 2, i);
+                    A_pack((i_pack + np - 1) % np, i) = A(i_pack2 - 1, i);
+                }
+                i_pack = (i_pack + np - 2) % np;
+                i_pack2 -= 2;
+            }
+            else if (i_pack2 > 0) {
+                for (idx_t i = 0; i < n; i++) {
+                    A_pack((i_pack + np - 1) % np, i) = A(i_pack2 - 1, i);
+                }
+                i_pack = (i_pack + np - 1) % np;
+                i_pack2 -= 1;
+            }
+        }
+
+        // By now, i_pack2 should point to the start of the matrix.
+        assert(i_pack2 == 0);
+
+        // Shutdown phase
+        for (idx_t j = (m - k + 1) % 2; j < k; ++j) {
+            for (idx_t i = j, g = 0; i < k; ++i, ++g) {
+                auto a1 = A_pack.row((g + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np);
+                rot(a1, a2, C(g, i), S(g, i));
+            }
+        }
+
+        // Copy the first np rows of A_pack back to A
+        for (idx_t i = 0; i < np; ++i) {
+            for (idx_t j = 0; j < n; ++j) {
+                A(i, j) = A_pack((i_pack + i) % np, j);
+            }
+        }
+    }
+
+    // Kernel for lasr3, forward right variant
+    template <typename T,
+              typename TC,
+              typename TS,
+              Layout layout,
+              typename idx_t>
+    void lasr3_kernel_forward_right(ConstMatrix<TC, layout, idx_t> C,
+                                    ConstMatrix<TS, layout, idx_t> S,
+                                    Matrix<T, layout, idx_t> A,
+                                    MemoryBlock<T, idx_t, true> work)
+    {
+        idx_t m = A.num_rows();
+        idx_t n = A.num_columns();
+        idx_t k = C.num_columns();
+
+        assert(C.num_rows() == S.num_rows());
+        assert(C.num_columns() == S.num_columns());
+        assert(n == C.num_rows() + 1);
+        assert(n > k + 1);
+
+        // Number of columns that will be stored in the packed workspace matrix
+        const idx_t np = k + 2;
+        // During the algorithm, instead of applying the rotations directly to
+        // the matrix A, we will apply them to a packed version of A. This is
+        // done to allow for efficient vectorization of the rotations. The
+        // packed matrix is stored in the workspace array. Since we apply the
+        // rotations from the right, this packed matrix is always column-major
+        // regardless of the layout of A.
+        Matrix<T, Layout::ColMajor, idx_t> A_pack(m, np, work);
+        // A_pack will store np columns of A.
+        // To avoid having to shuffle columns in A, we keep track of two indices
+        // For i = i_pack2, ..., i_pack2 + np - 1, column (i_pack + i - i_pack2
+        // + np) % np of A_pack corresponds to column i of A.
+        idx_t i_pack = 0;
+        idx_t i_pack2 = 0;
+
+        // Copy the first np columns of A to A_pack
+        for (idx_t j = 0; j < np; ++j) {
+            for (idx_t i = 0; i < m; ++i) {
+                A_pack(i, j) = A(i, j);
+            }
+        }
+
+        // Startup phase
+        for (idx_t j = 0; j + 1 < k; ++j) {
+            for (idx_t i = 0, g = j; i < j + 1; ++i, --g) {
+                rot(A_pack.column(g), A_pack.column(g + 1), C(g, i),
+                    conj(S(g, i)));
+            }
+        }
+
+        // Pipeline phase
+        for (idx_t j = k - 1; j + 1 < n - 1; j += 2) {
+            for (idx_t i = 0, g = j; i + 1 < k; i += 2, g -= 2) {
+                auto a1 = A_pack.column((g - 1 + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.column((g + i_pack - i_pack2 + np) % np);
+                auto a3 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np);
+                auto a4 = A_pack.column((g + 2 + i_pack - i_pack2 + np) % np);
+
+                rot_fuse2x2(a1, a2, a3, a4, C(g, i), conj(S(g, i)),
+                            C(g - 1, i + 1), conj(S(g - 1, i + 1)), C(g + 1, i),
+                            conj(S(g + 1, i)), C(g, i + 1), conj(S(g, i + 1)));
+            }
+            if (k % 2 == 1) {
+                // k is odd, so there are two more rotations to apply
+                idx_t i = k - 1;
+                idx_t g = j - i;
+
+                auto a1 = A_pack.column((g + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np);
+                auto a3 = A_pack.column((g + 2 + i_pack - i_pack2 + np) % np);
+
+                rot_fuse2x1(a1, a2, a3, C(g, i), conj(S(g, i)), C(g + 1, i),
+                            conj(S(g + 1, i)));
+            }
+            // columns i_pack and i_pack+1 of A_pack are finished, copy them
+            // back to A
+            for (idx_t i = 0; i < m; i++) {
+                A(i, i_pack2) = A_pack(i, i_pack);
+                A(i, i_pack2 + 1) = A_pack(i, (i_pack + 1) % np);
+            }
+            // Pack next columns and update i_pack and i_pack2
+            if (j + 4 < n) {
+                for (idx_t i = 0; i < m; i++) {
+                    A_pack(i, i_pack) = A(i, j + 3);
+                    A_pack(i, (i_pack + 1) % np) = A(i, j + 4);
+                }
+                i_pack = (i_pack + 2) % np;
+                i_pack2 += 2;
+            }
+            else if (j + 3 < n) {
+                for (idx_t i = 0; i < m; i++) {
+                    A_pack(i, i_pack) = A(i, j + 3);
+                }
+                i_pack = (i_pack + 1) % np;
+                i_pack2 += 1;
+            }
+        }
+
+        // Shutdown phase
+        for (idx_t j = (n - k + 1) % 2; j < k; ++j) {
+            for (idx_t i = j, g = n - 2; i < k; ++i, --g) {
+                auto a1 = A_pack.column((g + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np);
+                rot(a1, a2, C(g, i), conj(S(g, i)));
+            }
+        }
+
+        // Copy the last np columns of A_pack back to A
+        for (idx_t i = 0; i < std::min(np, n - i_pack2); ++i) {
+            for (idx_t j = 0; j < m; ++j) {
+                A(j, i_pack2 + i) = A_pack(j, (i_pack + i) % np);
+            }
+        }
+    }
+
+    // Kernel for lasr3, backward right variant
+    template <typename T,
+              typename TC,
+              typename TS,
+              Layout layout,
+              typename idx_t>
+    void lasr3_kernel_backward_right(ConstMatrix<TC, layout, idx_t> C,
+                                     ConstMatrix<TS, layout, idx_t> S,
+                                     Matrix<T, layout, idx_t> A,
+                                     MemoryBlock<T, idx_t, true> work)
+    {
+        idx_t m = A.num_rows();
+        idx_t n = A.num_columns();
+        idx_t k = C.num_columns();
+
+        assert(C.num_rows() == S.num_rows());
+        assert(C.num_columns() == S.num_columns());
+        assert(n == C.num_rows() + 1);
+        assert(n > k + 1);
+
+        // Number of columns that will be stored in the packed workspace matrix
+        const idx_t np = k + 2;
+        // During the algorithm, instead of applying the rotations directly to
+        // the matrix A, we will apply them to a packed version of A. This is
+        // done to allow for efficient vectorization of the rotations. The
+        // packed matrix is stored in the workspace array. Since we apply the
+        // rotations from the right, this packed matrix is always column-major
+        // regardless of the layout of A.
+        Matrix<T, Layout::ColMajor, idx_t> A_pack(m, np, work);
+        // A_pack will store np columns of A.
+        // To avoid having to shuffle columns in A, we keep track of two indices
+        // For i = i_pack2, ..., i_pack2 + np - 1, column (i_pack + i - i_pack2
+        // + np) % np of A_pack corresponds to column i of A.
+        idx_t i_pack = 0;
+        idx_t i_pack2 = n - np;
+
+        // Copy the last np columns of A to A_pack
+        for (idx_t i = 0; i < m; ++i) {
+            for (idx_t j = 0; j < np; ++j) {
+                A_pack(i, j) = A(i, n - np + j);
+            }
+        }
+
+        // Startup phase
+        for (idx_t j = 0; j + 1 < k; ++j) {
+            for (idx_t i = 0, g = n - 2 - j; i < j + 1; ++i, ++g) {
+                rot(A_pack.column((g - i_pack2) % np),
+                    A_pack.column((g + 1 - i_pack2) % np), C(g, i),
+                    conj(S(g, i)));
+            }
+        }
+
+        // Pipeline phase
+        for (idx_t j = k - 1; j + 1 < n - 1; j += 2) {
+            for (idx_t i = 0, g = n - 2 - j; i + 1 < k; i += 2, g += 2) {
+                auto a1 = A_pack.column((g - 1 + i_pack - i_pack2) % np);
+                auto a2 = A_pack.column((g + i_pack - i_pack2) % np);
+                auto a3 = A_pack.column((g + 1 + i_pack - i_pack2) % np);
+                auto a4 = A_pack.column((g + 2 + i_pack - i_pack2) % np);
+
+                rot_fuse2x2(a1, a2, a3, a4, C(g, i), conj(S(g, i)), C(g - 1, i),
+                            conj(S(g - 1, i)), C(g + 1, i + 1),
+                            conj(S(g + 1, i + 1)), C(g, i + 1),
+                            conj(S(g, i + 1)));
+            }
+            if (k % 2 == 1) {
+                // k is odd, so there are two more rotations to apply
+                idx_t i = k - 1;
+                idx_t g = n - 2 - j + i;
+
+                auto a1 = A_pack.column((g - 1 + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.column((g + i_pack - i_pack2 + np) % np);
+                auto a3 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np);
+
+                rot_fuse1x2(a1, a2, a3, C(g, i), conj(S(g, i)), C(g - 1, i),
+                            conj(S(g - 1, i)));
+            }
+            // columns i_pack+np-2 and i_pack+np-1 are finished, copy them back
+            // to A
+            for (idx_t i = 0; i < m; i++) {
+                A(i, i_pack2 + np - 2) = A_pack(i, (i_pack + np - 2) % np);
+                A(i, i_pack2 + np - 1) = A_pack(i, (i_pack + np - 1) % np);
+            }
+            // Pack next rows and update i_pack and i_pack2
+            if (i_pack2 > 1) {
+                for (idx_t i = 0; i < m; i++) {
+                    A_pack(i, (i_pack + np - 2) % np) = A(i, i_pack2 - 2);
+                    A_pack(i, (i_pack + np - 1) % np) = A(i, i_pack2 - 1);
+                }
+                i_pack = (i_pack + np - 2) % np;
+                i_pack2 -= 2;
+            }
+            else if (i_pack2 > 0) {
+                for (idx_t i = 0; i < m; i++) {
+                    A_pack(i, (i_pack + np - 1) % np) = A(i, i_pack2 - 1);
+                }
+                i_pack = (i_pack + np - 1) % np;
+                i_pack2 -= 1;
+            }
+        }
+
+        // By now, i_pack2 should point to the start of the matrix.
+        assert(i_pack2 == 0);
+
+        // Shutdown phase
+        for (idx_t j = (n - k + 1) % 2; j < k; ++j) {
+            for (idx_t i = j, g = 0; i < k; ++i, ++g) {
+                auto a1 = A_pack.column((g + i_pack - i_pack2 + np) % np);
+                auto a2 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np);
+                rot(a1, a2, C(g, i), conj(S(g, i)));
+            }
+        }
+
+        // Copy the first np columns of A_pack back to A
+        for (idx_t i = 0; i < m; ++i) {
+            for (idx_t j = 0; j < np; ++j) {
+                A(i, j) = A_pack(i, (i_pack + j) % np);
+            }
+        }
+    }
+}  // namespace internal
+
+/**
+ * @copydoc lasr3
+ */
+template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+void lasr3(Side side,
+           Direction direct,
+           ConstMatrix<TC, layout, idx_t> C,
+           ConstMatrix<TS, layout, idx_t> S,
+           Matrix<T, layout, idx_t> A,
+           MemoryBlock<T, idx_t, true> work)
+{
+    const idx_t m = A.num_rows();
+    const idx_t n = A.num_columns();
+    const idx_t k = C.num_columns();
+    const idx_t num_rot = C.num_rows();
+
+    assert(side == Side::Left || side == Side::Right);
+    assert(direct == Direction::Forward || direct == Direction::Backward);
+    assert(C.num_rows() == S.num_rows());
+    assert(C.num_columns() == S.num_columns());
+    assert(side == Side::Left ? m : n == C.num_rows() + 1);
+    assert(work.size() >= lasr3_workquery(side, direct, C, S, A));
+
+    // Blocksize for the number of rows/columns of A to be processed at once
+    const idx_t block_mn = 256;
+    // Blocksize for the number of rotation sequences to be processed at once
+    const idx_t block_k = 64;
+
+    if (num_rot < 4) {
+        // If there are less than 4 rotations, we can't really pipeline,
+        // just use standard loops
+        if (side == Side::Left) {
+            if (direct == Direction::Forward) {
+                for (idx_t i = 0; i < k; ++i)
+                    for (idx_t j = 0; j < num_rot; ++j)
+                        rot(A.row(j), A.row(j + 1), C(j, i), S(j, i));
+            }
+            else {
+                for (idx_t i = 0; i < k; ++i)
+                    for (idx_t j = num_rot - 1; j >= 0; --j)
+                        rot(A.row(j), A.row(j + 1), C(j, i), S(j, i));
+            }
+        }
+        else {
+            if (direct == Direction::Forward) {
+                for (idx_t i = 0; i < k; ++i)
+                    for (idx_t j = 0; j < num_rot; ++j)
+                        rot(A.column(j), A.column(j + 1), C(j, i),
+                            conj(S(j, i)));
+            }
+            else {
+                for (idx_t i = 0; i < k; ++i)
+                    for (idx_t j = num_rot - 1; j >= 0; --j)
+                        rot(A.column(j), A.column(j + 1), C(j, i),
+                            conj(S(j, i)));
+            }
+        }
+        return;
+    }
+
+    // We can only pipeline if num_rot > k + 2
+    // And for good performance, we want an even larger ratio
+    const idx_t block_k2 = std::min(num_rot - 3, block_k);
+
+    if (side == Side::Left and direct == Direction::Forward) {
+        for (idx_t n_b = 0; n_b < n; n_b += block_mn) {
+            auto A2 = A.submatrix(0, m, n_b, std::min(n, n_b + block_mn));
+            for (idx_t k_b = 0; k_b < k; k_b += block_k2) {
+                auto C2 = C.submatrix(0, C.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                auto S2 = S.submatrix(0, S.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                internal::lasr3_kernel_forward_left(C2, S2, A2, work);
+            }
+        }
+    }
+
+    if (side == Side::Left and direct == Direction::Backward) {
+        for (idx_t n_b = 0; n_b < n; n_b += block_mn) {
+            auto A2 = A.submatrix(0, m, n_b, std::min(n, n_b + block_mn));
+            for (idx_t k_b = 0; k_b < k; k_b += block_k2) {
+                auto C2 = C.submatrix(0, C.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                auto S2 = S.submatrix(0, S.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                internal::lasr3_kernel_backward_left(C2, S2, A2, work);
+            }
+        }
+    }
+
+    if (side == Side::Right and direct == Direction::Forward) {
+        for (idx_t m_b = 0; m_b < m; m_b += block_mn) {
+            auto A2 = A.submatrix(m_b, std::min(m, m_b + block_mn), 0, n);
+            for (idx_t k_b = 0; k_b < k; k_b += block_k2) {
+                auto C2 = C.submatrix(0, C.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                auto S2 = S.submatrix(0, S.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                internal::lasr3_kernel_forward_right(C2, S2, A2, work);
+            }
+        }
+    }
+
+    if (side == Side::Right and direct == Direction::Backward) {
+        for (idx_t m_b = 0; m_b < m; m_b += block_mn) {
+            auto A2 = A.submatrix(m_b, std::min(m, m_b + block_mn), 0, n);
+            for (idx_t k_b = 0; k_b < k; k_b += block_k2) {
+                auto C2 = C.submatrix(0, C.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                auto S2 = S.submatrix(0, S.num_rows(), k_b,
+                                      std::min(k, k_b + block_k2));
+                internal::lasr3_kernel_backward_right(C2, S2, A2, work);
+            }
+        }
+    }
+}
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_LASR3_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/utils.hpp b/LAPACK_CPP/include/lapack_cpp/utils.hpp
new file mode 100644
index 000000000..8276c3e16
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/utils.hpp
@@ -0,0 +1,87 @@
+#ifndef LAPACK_CPP_UTILS_HPP
+#define LAPACK_CPP_UTILS_HPP
+
+#include <iostream>
+#include <type_traits>
+
+#include "lapack_cpp/base.hpp"
+
+namespace lapack_cpp {
+
+// Code for printing
+template <typename T, typename idx_t>
+void print(const Vector<T, idx_t>& v)
+{
+    std::cout << "(" << v.size() << ")[" << std::endl;
+    for (idx_t i = 0; i < v.size(); ++i) {
+        std::cout << v[i] << std::endl;
+    }
+    std::cout << "]" << std::endl;
+}
+
+// Code for printing
+template <typename T, Layout layout, typename idx_t>
+void print(const Matrix<T, layout, idx_t>& m)
+{
+    std::cout << "(" << m.num_rows() << "," << m.num_columns() << ")["
+              << std::endl;
+    for (idx_t i = 0; i < m.num_rows(); ++i) {
+        std::cout << "[";
+        for (idx_t j = 0; j < m.num_columns() - 1; ++j) {
+            std::cout << m(i, j) << ",";
+        }
+        std::cout << m(i, m.num_columns() - 1) << "]" << std::endl;
+    }
+    std::cout << "]" << std::endl;
+}
+
+// Code for printing
+template <typename T, Layout layout, typename idx_t>
+void print(const ConstMatrix<T, layout, idx_t>& m)
+{
+    std::cout << "(" << m.num_rows() << "," << m.num_columns() << ")["
+              << std::endl;
+    for (idx_t i = 0; i < m.num_rows(); ++i) {
+        std::cout << "[";
+        for (idx_t j = 0; j < m.num_columns() - 1; ++j) {
+            std::cout << m(i, j) << ",";
+        }
+        std::cout << m(i, m.num_columns() - 1) << "]" << std::endl;
+    }
+    std::cout << "]" << std::endl;
+}
+
+// Initialize a matrix with random values
+template <typename T, Layout layout, typename idx_t>
+void randomize(Matrix<T, layout, idx_t>& m)
+{
+    for (idx_t j = 0; j < m.num_columns(); ++j) {
+        for (idx_t i = 0; i < m.num_rows(); ++i) {
+            m(i, j) = (T)rand() / (T)RAND_MAX;
+        }
+    }
+}
+
+template <typename T, Layout layout, typename idx_t>
+void randomize(Matrix<std::complex<T>, layout, idx_t>& m)
+{
+    for (idx_t j = 0; j < m.num_columns(); ++j) {
+        for (idx_t i = 0; i < m.num_rows(); ++i) {
+            m(i, j) = std::complex<T>((T)rand() / (T)RAND_MAX,
+                                      (T)rand() / (T)RAND_MAX);
+        }
+    }
+}
+
+// Initialize a vector with random values
+template <typename T, typename idx_t>
+void randomize(Vector<T, idx_t>& v)
+{
+    for (idx_t i = 0; i < v.size(); ++i) {
+        v[i] = (T)rand() / (T)RAND_MAX;
+    }
+}
+
+}  // namespace lapack_cpp
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/lartg_c.f90 b/LAPACK_CPP/src/lapack_c/lartg_c.f90
new file mode 100644
index 000000000..6e88b07f5
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/lartg_c.f90
@@ -0,0 +1,53 @@
+subroutine lapack_c_slartg(f, g, c, s, r) bind(c, name='lapack_c_slartg')
+    use iso_c_binding
+    implicit none
+    real(kind=c_float), intent(in), value :: f, g
+    real(kind=c_float), intent(out) :: c, s, r
+
+    external slartg
+
+    ! Forward call to fortran implementation
+    call slartg( f, g, c, s, r )
+
+end subroutine lapack_c_slartg
+
+subroutine lapack_c_dlartg(f, g, c, s, r) bind(c, name='lapack_c_dlartg')
+    use iso_c_binding
+    implicit none
+    real(kind=c_double), intent(in), value :: f, g
+    real(kind=c_double), intent(out) :: c, s, r
+
+    external dlartg
+
+    ! Forward call to fortran implementation
+    call dlartg( f, g, c, s, r )
+
+end subroutine lapack_c_dlartg
+
+subroutine lapack_c_clartg(f, g, c, s, r) bind(c, name='lapack_c_clartg')
+    use iso_c_binding
+    implicit none
+    complex(kind=c_float_complex), intent(in), value :: f, g
+    complex(kind=c_float_complex), intent(out) :: s, r
+    real(kind=c_float), intent(out) :: c
+
+    external clartg
+
+    ! Forward call to fortran implementation
+    call clartg( f, g, c, s, r )
+
+end subroutine lapack_c_clartg
+
+subroutine lapack_c_zlartg(f, g, c, s, r) bind(c, name='lapack_c_zlartg')
+    use iso_c_binding
+    implicit none
+    complex(kind=c_double_complex), intent(in), value :: f, g
+    complex(kind=c_double_complex), intent(out) :: s, r
+    real(kind=c_double), intent(out) :: c
+
+    external zlartg
+
+    ! Forward call to fortran implementation
+    call zlartg( f, g, c, s, r )
+
+end subroutine lapack_c_zlartg
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp
new file mode 100644
index 000000000..344703e4e
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp
@@ -0,0 +1,206 @@
+#include "lapack_c/lasr3.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/lasr3.hpp"
+
+using namespace lapack_cpp;
+
+// Actual definitions
+extern "C" {
+
+void slasr3(char side,
+            char direct,
+            lapack_idx m,
+            lapack_idx n,
+            lapack_idx k,
+            const float* C,
+            lapack_idx ldc,
+            const float* S,
+            lapack_idx lds,
+            float* A,
+            lapack_idx lda,
+            float* work,
+            lapack_idx lwork)
+{
+    Side side_ = char2side(side);
+    Direction direct_ = char2direct(direct);
+    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+    ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+    ConstMatrix<float, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+    Matrix<float, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+    if (lwork == -1) {
+        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+        *work = lwork_;
+        return;
+    }
+
+    MemoryBlock<float, lapack_idx> work_(lwork, work);
+    lasr3(side_, direct_, C_, S_, A_, work_);
+}
+
+void dlasr3(char side,
+            char direct,
+            lapack_idx m,
+            lapack_idx n,
+            lapack_idx k,
+            const double* C,
+            lapack_idx ldc,
+            const double* S,
+            lapack_idx lds,
+            double* A,
+            lapack_idx lda,
+            double* work,
+            lapack_idx lwork)
+{
+    Side side_ = char2side(side);
+    Direction direct_ = char2direct(direct);
+    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+    ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+    ConstMatrix<double, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+    Matrix<double, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+    if (lwork == -1) {
+        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+        *work = lwork_;
+        return;
+    }
+
+    MemoryBlock<double, lapack_idx> work_(lwork, work);
+    lasr3(side_, direct_, C_, S_, A_, work_);
+}
+
+void clasr3(char side,
+            char direct,
+            lapack_idx m,
+            lapack_idx n,
+            lapack_idx k,
+            const float* C,
+            lapack_idx ldc,
+            const lapack_float_complex* S,
+            lapack_idx lds,
+            lapack_float_complex* A,
+            lapack_idx lda,
+            lapack_float_complex* work,
+            lapack_idx lwork)
+{
+    Side side_ = char2side(side);
+    Direction direct_ = char2direct(direct);
+    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+    ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+    ConstMatrix<lapack_float_complex, Layout::ColMajor, lapack_idx> S_(n_rot, k,
+                                                                       S, lds);
+    Matrix<lapack_float_complex, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+    if (lwork == -1) {
+        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+        *work = lwork_;
+        return;
+    }
+
+    MemoryBlock<lapack_float_complex, lapack_idx> work_(lwork, work);
+    lasr3(side_, direct_, C_, S_, A_, work_);
+}
+
+void zlasr3(char side,
+            char direct,
+            lapack_idx m,
+            lapack_idx n,
+            lapack_idx k,
+            const double* C,
+            lapack_idx ldc,
+            const lapack_double_complex* S,
+            lapack_idx lds,
+            lapack_double_complex* A,
+            lapack_idx lda,
+            lapack_double_complex* work,
+            lapack_idx lwork)
+{
+    Side side_ = char2side(side);
+    Direction direct_ = char2direct(direct);
+    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+    ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+    ConstMatrix<lapack_double_complex, Layout::ColMajor, lapack_idx> S_(
+        n_rot, k, S, lds);
+    Matrix<lapack_double_complex, Layout::ColMajor, lapack_idx> A_(m, n, A,
+                                                                   lda);
+
+    if (lwork == -1) {
+        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+        *work = lwork_;
+        return;
+    }
+
+    MemoryBlock<lapack_double_complex, lapack_idx> work_(lwork, work);
+    lasr3(side_, direct_, C_, S_, A_, work_);
+}
+
+void cslasr3(char side,
+             char direct,
+             lapack_idx m,
+             lapack_idx n,
+             lapack_idx k,
+             const float* C,
+             lapack_idx ldc,
+             const float* S,
+             lapack_idx lds,
+             lapack_float_complex* A,
+             lapack_idx lda,
+             lapack_float_complex* work,
+             lapack_idx lwork)
+{
+    Side side_ = char2side(side);
+    Direction direct_ = char2direct(direct);
+    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+    ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+    ConstMatrix<float, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+    Matrix<lapack_float_complex, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+    if (lwork == -1) {
+        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+        *work = lwork_;
+        return;
+    }
+
+    MemoryBlock<lapack_float_complex, lapack_idx> work_(lwork, work);
+    lasr3(side_, direct_, C_, S_, A_, work_);
+}
+
+void zdlasr3(char side,
+             char direct,
+             lapack_idx m,
+             lapack_idx n,
+             lapack_idx k,
+             const double* C,
+             lapack_idx ldc,
+             const double* S,
+             lapack_idx lds,
+             lapack_double_complex* A,
+             lapack_idx lda,
+             lapack_double_complex* work,
+             lapack_idx lwork)
+{
+    Side side_ = char2side(side);
+    Direction direct_ = char2direct(direct);
+    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+    ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+    ConstMatrix<double, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+    Matrix<lapack_double_complex, Layout::ColMajor, lapack_idx> A_(m, n, A,
+                                                                   lda);
+
+    if (lwork == -1) {
+        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+        *work = lwork_;
+        return;
+    }
+
+    MemoryBlock<lapack_double_complex, lapack_idx> work_(lwork, work);
+    lasr3(side_, direct_, C_, S_, A_, work_);
+}
+
+}  // extern "C"
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/rot_c.f90 b/LAPACK_CPP/src/lapack_c/rot_c.f90
new file mode 100644
index 000000000..6876c9036
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/rot_c.f90
@@ -0,0 +1,57 @@
+subroutine lapack_c_drot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_drot')
+    use iso_c_binding
+    implicit none
+    integer(kind=c_int), value, intent(in) :: n, incx, incy
+    real(kind=c_double), value, intent(in) :: c, s
+    real(kind=c_double), dimension(*), intent(inout) :: dx, dy
+
+    external drot
+
+    ! Forward call to fortran implementation
+    call drot( n, dx ,incx, dy, incy, c, s )
+
+end subroutine lapack_c_drot
+
+subroutine lapack_c_srot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_srot')
+    use iso_c_binding
+    implicit none
+    integer(kind=c_int), value, intent(in) :: n, incx, incy
+    real(kind=c_float), value, intent(in) :: c, s
+    real(kind=c_float), dimension(*), intent(inout) :: dx, dy
+
+    external srot
+
+    ! Forward call to fortran implementation
+    call srot( n, dx ,incx, dy, incy, c, s )
+
+end subroutine lapack_c_srot
+
+subroutine lapack_c_crot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_crot')
+    use iso_c_binding
+    implicit none
+    integer(kind=c_int), value, intent(in) :: n, incx, incy
+    real(kind=c_float), value, intent(in) :: c
+    complex(kind=c_float_complex), value, intent(in) :: s
+    complex(kind=c_float_complex), dimension(*), intent(inout) :: dx, dy
+
+    external crot
+
+    ! Forward call to fortran implementation
+    call crot( n, dx ,incx, dy, incy, c, s )
+
+end subroutine lapack_c_crot
+
+subroutine lapack_c_zrot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_zrot')
+    use iso_c_binding
+    implicit none
+    integer(kind=c_int), value, intent(in) :: n, incx, incy
+    real(kind=c_double), value, intent(in) :: c
+    complex(kind=c_double_complex), value, intent(in) :: s
+    complex(kind=c_double_complex), dimension(*), intent(inout) :: dx, dy
+
+    external zrot
+
+    ! Forward call to fortran implementation
+    call zrot( n, dx ,incx, dy, incy, c, s )
+
+end subroutine lapack_c_zrot
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp
new file mode 100644
index 000000000..576beca4a
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp
@@ -0,0 +1,43 @@
+#include <cassert>
+#include <complex>
+#include <type_traits>
+
+#include "lapack_c.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/lartg.hpp"
+
+namespace lapack_cpp {
+
+template <>
+void lartg<float>(float f, float g, float& c, float& s, float& r)
+{
+    lapack_c_slartg(f, g, &c, &s, &r);
+}
+
+template <>
+void lartg<double>(double f, double g, double& c, double& s, double& r)
+{
+    lapack_c_dlartg(f, g, &c, &s, &r);
+}
+
+template <>
+void lartg<std::complex<float>>(std::complex<float> f,
+                                std::complex<float> g,
+                                float& c,
+                                std::complex<float>& s,
+                                std::complex<float>& r)
+{
+    lapack_c_clartg(f, g, &c, &s, &r);
+}
+
+template <>
+void lartg<std::complex<double>>(std::complex<double> f,
+                                 std::complex<double> g,
+                                 double& c,
+                                 std::complex<double>& s,
+                                 std::complex<double>& r)
+{
+    lapack_c_zlartg(f, g, &c, &s, &r);
+}
+
+}  // namespace lapack_cpp
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp
new file mode 100644
index 000000000..a50a35607
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp
@@ -0,0 +1,67 @@
+#ifdef USE_FORTRAN_BLAS
+    #include <cassert>
+    #include <complex>
+    #include <type_traits>
+
+    #include "lapack_c.h"
+    #include "lapack_cpp/base.hpp"
+    #include "lapack_cpp/blas/rot.hpp"
+
+namespace lapack_cpp {
+
+/**
+ * Templated wrapper around c functions, will be instantiated for each type.
+ *
+ * @tparam T
+ * @param A
+ * @param B
+ * @param C
+ */
+template <typename T, typename TC, typename TS, typename idx_t>
+inline void rot_c_wrapper(const Vector<T, idx_t>& x,
+                          const Vector<T, idx_t>& y,
+                          TC c,
+                          TS s)
+{
+    assert(x.size() == y.size());
+
+    if constexpr (std::is_same<T, double>::value) {
+        lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+    }
+    else if constexpr (std::is_same<T, float>::value) {
+        lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+    }
+    else if constexpr (std::is_same<T, std::complex<float>>::value) {
+        lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+    }
+    else if constexpr (std::is_same<T, std::complex<double>>::value) {
+        lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+    }
+    else {
+        assert(false);
+    }
+}
+
+    // We have a lot of types to instantiate for, so we use a macro to avoid
+    // repetition.
+    #define INSTANTIATE_ROT(T, TC, idx_t)                                    \
+        template <>                                                          \
+        void rot(const Vector<T, idx_t>& x, const Vector<T, idx_t>& y, TC c, \
+                 T s)                                                        \
+        {                                                                    \
+            rot_c_wrapper(x, y, c, s);                                       \
+        }
+
+INSTANTIATE_ROT(float, float, lapack_idx_t)
+INSTANTIATE_ROT(double, double, lapack_idx_t)
+INSTANTIATE_ROT(std::complex<float>, float, lapack_idx_t)
+INSTANTIATE_ROT(std::complex<double>, double, lapack_idx_t)
+INSTANTIATE_ROT(float, float, int)
+INSTANTIATE_ROT(double, double, int)
+INSTANTIATE_ROT(std::complex<float>, float, int)
+INSTANTIATE_ROT(std::complex<double>, double, int)
+
+    #undef INSTANTIATE_ROT
+
+}  // namespace lapack_cpp
+#endif  // USE_FORTRAN_BLAS
\ No newline at end of file

From 1a95ff9b675b929c6e9ca4bd032f307ce3bb1c1d Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Thu, 29 Feb 2024 16:21:40 +0100
Subject: [PATCH 02/10] add fortran wrappers for lasr3

---
 LAPACK_CPP/Makefile                      |  14 +-
 LAPACK_CPP/include/lapack_c/lasr3.h      | 159 +++++-----
 LAPACK_CPP/src/lapack_c/lasr3_c.cpp      | 379 ++++++++++++-----------
 LAPACK_CPP/src/lapack_fortran/clasr3.f90 |  32 ++
 LAPACK_CPP/src/lapack_fortran/dlasr3.f90 |  30 ++
 LAPACK_CPP/src/lapack_fortran/slasr3.f90 |  30 ++
 LAPACK_CPP/src/lapack_fortran/zlasr3.f90 |  32 ++
 7 files changed, 410 insertions(+), 266 deletions(-)
 create mode 100644 LAPACK_CPP/src/lapack_fortran/clasr3.f90
 create mode 100644 LAPACK_CPP/src/lapack_fortran/dlasr3.f90
 create mode 100644 LAPACK_CPP/src/lapack_fortran/slasr3.f90
 create mode 100644 LAPACK_CPP/src/lapack_fortran/zlasr3.f90

diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile
index 6d0d8eb0c..cd5adf0bf 100644
--- a/LAPACK_CPP/Makefile
+++ b/LAPACK_CPP/Makefile
@@ -14,11 +14,19 @@ HEADERS = $(wildcard lapack_c/include/*.h) \
 		  $(wildcard lapack_cpp/include/*/*.hpp) \
 		  $(wildcard lapack_cpp/include/*/*/*.hpp)
 
-OBJFILES = src/lapack_c/rot_c.o \
+OBJFILES = src/lapack_cpp/rot_cpp.o \
+		   src/lapack_cpp/lartg_cpp.o \
+		   src/lapack_c/rot_c.o \
 		   src/lapack_c/lartg_c.o \
 		   src/lapack_c/lasr3_c.o \
-		   src/lapack_cpp/rot_cpp.o \
-		   src/lapack_cpp/lartg_cpp.o \
+		   src/lapack_fortran/slasr3.o \
+		   src/lapack_fortran/dlasr3.o \
+		   src/lapack_fortran/clasr3.o \
+		   src/lapack_fortran/zlasr3.o \
+
+# Fortran files
+src/lapack_fortran/%.o: src/lapack_fortran/%.f90
+	$(FC) $(FFLAGS) -cpp -c -o $@ $<
 
 # C files
 
diff --git a/LAPACK_CPP/include/lapack_c/lasr3.h b/LAPACK_CPP/include/lapack_c/lasr3.h
index efd86a916..e86479730 100644
--- a/LAPACK_CPP/include/lapack_c/lasr3.h
+++ b/LAPACK_CPP/include/lapack_c/lasr3.h
@@ -4,91 +4,96 @@
 #include "lapack_c/util.h"
 
 #ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
+extern "C"
+{
+#endif // __cplusplus
 
-void lapack_c_slasr3(char side,
-                     char direct,
-                     lapack_idx m,
-                     lapack_idx n,
-                     lapack_idx k,
-                     const float* C,
-                     lapack_idx ldc,
-                     const float* S,
-                     lapack_idx lds,
-                     float* A,
-                     lapack_idx lda,
-                     float* work,
-                     lapack_idx lwork);
+    void lapack_c_slasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const float *C,
+                         lapack_idx ldc,
+                         const float *S,
+                         lapack_idx lds,
+                         float *A,
+                         lapack_idx lda,
+                         float *work,
+                         lapack_idx lwork);
 
-void lapack_c_dlasr3(char side,
-                     char direct,
-                     lapack_idx m,
-                     lapack_idx n,
-                     lapack_idx k,
-                     const double* C,
-                     lapack_idx ldc,
-                     const double* S,
-                     lapack_idx lds,
-                     double* A,
-                     lapack_idx lda,
-                     double* work,
-                     lapack_idx lwork);
+    void lapack_c_dlasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const double *C,
+                         lapack_idx ldc,
+                         const double *S,
+                         lapack_idx lds,
+                         double *A,
+                         lapack_idx lda,
+                         double *work,
+                         lapack_idx lwork);
 
-void lapack_c_clasr3(char side,
-                     char direct,
-                     lapack_idx m,
-                     lapack_idx n,
-                     lapack_idx k,
-                     const float* C,
-                     lapack_idx ldc,
-                     const lapack_float_complex* S,
-                     lapack_idx lds,
-                     lapack_float_complex* A,
-                     lapack_idx lda,
-                     lapack_float_complex* work,
-                     lapack_idx lwork);
+    void lapack_c_clasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const float *C,
+                         lapack_idx ldc,
+                         const lapack_float_complex *S,
+                         lapack_idx lds,
+                         lapack_float_complex *A,
+                         lapack_idx lda,
+                         lapack_float_complex *work,
+                         lapack_idx lwork);
 
-void lapack_c_sclasr3(char side,
-                      char direct,
-                      lapack_idx m,
-                      lapack_idx n,
-                      lapack_idx k,
-                      const float* C,
-                      lapack_idx ldc,
-                      const float* S,
-                      lapack_idx lds,
-                      lapack_float_complex* A,
-                      lapack_idx lda,
-                      lapack_float_complex* work,
-                      lapack_idx lwork);
+    void lapack_c_sclasr3(char side,
+                          char direct,
+                          lapack_idx m,
+                          lapack_idx n,
+                          lapack_idx k,
+                          const float *C,
+                          lapack_idx ldc,
+                          const float *S,
+                          lapack_idx lds,
+                          lapack_float_complex *A,
+                          lapack_idx lda,
+                          lapack_float_complex *work,
+                          lapack_idx lwork);
 
-void lapack_c_zlasr3(lapack_idx m,
-                     lapack_idx n,
-                     lapack_idx k,
-                     const double* C,
-                     lapack_idx ldc,
-                     const lapack_double_complex* S,
-                     lapack_idx lds,
-                     lapack_double_complex* A,
-                     lapack_idx lda,
-                     lapack_double_complex* work,
-                     lapack_idx lwork);
+    void lapack_c_zlasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const double *C,
+                         lapack_idx ldc,
+                         const lapack_double_complex *S,
+                         lapack_idx lds,
+                         lapack_double_complex *A,
+                         lapack_idx lda,
+                         lapack_double_complex *work,
+                         lapack_idx lwork);
 
-void lapack_c_dzlasr3(lapack_idx m,
-                      lapack_idx n,
-                      lapack_idx k,
-                      const double* C,
-                      lapack_idx ldc,
-                      const double* S,
-                      lapack_idx lds,
-                      lapack_double_complex* A,
-                      lapack_idx lda,
-                      lapack_double_complex* work,
-                      lapack_idx lwork);
+    void lapack_c_dzlasr3(char side,
+                          char direct, 
+                          lapack_idx m,
+                          lapack_idx n,
+                          lapack_idx k,
+                          const double *C,
+                          lapack_idx ldc,
+                          const double *S,
+                          lapack_idx lds,
+                          lapack_double_complex *A,
+                          lapack_idx lda,
+                          lapack_double_complex *work,
+                          lapack_idx lwork);
 
 #ifdef __cplusplus
 }
-#endif  // __cplusplus
+#endif // __cplusplus
 
 #endif
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp
index 344703e4e..8945d43ad 100644
--- a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp
+++ b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp
@@ -5,202 +5,209 @@
 using namespace lapack_cpp;
 
 // Actual definitions
-extern "C" {
-
-void slasr3(char side,
-            char direct,
-            lapack_idx m,
-            lapack_idx n,
-            lapack_idx k,
-            const float* C,
-            lapack_idx ldc,
-            const float* S,
-            lapack_idx lds,
-            float* A,
-            lapack_idx lda,
-            float* work,
-            lapack_idx lwork)
+extern "C"
 {
-    Side side_ = char2side(side);
-    Direction direct_ = char2direct(direct);
-    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
-
-    ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
-    ConstMatrix<float, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
-    Matrix<float, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
-
-    if (lwork == -1) {
-        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
-        *work = lwork_;
-        return;
-    }
 
-    MemoryBlock<float, lapack_idx> work_(lwork, work);
-    lasr3(side_, direct_, C_, S_, A_, work_);
-}
-
-void dlasr3(char side,
-            char direct,
-            lapack_idx m,
-            lapack_idx n,
-            lapack_idx k,
-            const double* C,
-            lapack_idx ldc,
-            const double* S,
-            lapack_idx lds,
-            double* A,
-            lapack_idx lda,
-            double* work,
-            lapack_idx lwork)
-{
-    Side side_ = char2side(side);
-    Direction direct_ = char2direct(direct);
-    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
-
-    ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
-    ConstMatrix<double, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
-    Matrix<double, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
-
-    if (lwork == -1) {
-        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
-        *work = lwork_;
-        return;
+    void lapack_c_slasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const float *C,
+                         lapack_idx ldc,
+                         const float *S,
+                         lapack_idx lds,
+                         float *A,
+                         lapack_idx lda,
+                         float *work,
+                         lapack_idx lwork)
+    {
+        Side side_ = char2side(side);
+        Direction direct_ = char2direct(direct);
+        const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+        ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+        ConstMatrix<float, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+        Matrix<float, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+            *work = lwork_;
+            return;
+        }
+
+        MemoryBlock<float, lapack_idx> work_(lwork, work);
+        lasr3(side_, direct_, C_, S_, A_, work_);
     }
 
-    MemoryBlock<double, lapack_idx> work_(lwork, work);
-    lasr3(side_, direct_, C_, S_, A_, work_);
-}
-
-void clasr3(char side,
-            char direct,
-            lapack_idx m,
-            lapack_idx n,
-            lapack_idx k,
-            const float* C,
-            lapack_idx ldc,
-            const lapack_float_complex* S,
-            lapack_idx lds,
-            lapack_float_complex* A,
-            lapack_idx lda,
-            lapack_float_complex* work,
-            lapack_idx lwork)
-{
-    Side side_ = char2side(side);
-    Direction direct_ = char2direct(direct);
-    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
-
-    ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
-    ConstMatrix<lapack_float_complex, Layout::ColMajor, lapack_idx> S_(n_rot, k,
-                                                                       S, lds);
-    Matrix<lapack_float_complex, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
-
-    if (lwork == -1) {
-        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
-        *work = lwork_;
-        return;
+    void lapack_c_dlasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const double *C,
+                         lapack_idx ldc,
+                         const double *S,
+                         lapack_idx lds,
+                         double *A,
+                         lapack_idx lda,
+                         double *work,
+                         lapack_idx lwork)
+    {
+        Side side_ = char2side(side);
+        Direction direct_ = char2direct(direct);
+        const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+        ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+        ConstMatrix<double, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+        Matrix<double, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+            *work = lwork_;
+            return;
+        }
+
+        MemoryBlock<double, lapack_idx> work_(lwork, work);
+        lasr3(side_, direct_, C_, S_, A_, work_);
     }
 
-    MemoryBlock<lapack_float_complex, lapack_idx> work_(lwork, work);
-    lasr3(side_, direct_, C_, S_, A_, work_);
-}
-
-void zlasr3(char side,
-            char direct,
-            lapack_idx m,
-            lapack_idx n,
-            lapack_idx k,
-            const double* C,
-            lapack_idx ldc,
-            const lapack_double_complex* S,
-            lapack_idx lds,
-            lapack_double_complex* A,
-            lapack_idx lda,
-            lapack_double_complex* work,
-            lapack_idx lwork)
-{
-    Side side_ = char2side(side);
-    Direction direct_ = char2direct(direct);
-    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
-
-    ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
-    ConstMatrix<lapack_double_complex, Layout::ColMajor, lapack_idx> S_(
-        n_rot, k, S, lds);
-    Matrix<lapack_double_complex, Layout::ColMajor, lapack_idx> A_(m, n, A,
-                                                                   lda);
-
-    if (lwork == -1) {
-        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
-        *work = lwork_;
-        return;
+    void lapack_c_clasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const float *C,
+                         lapack_idx ldc,
+                         const lapack_float_complex *S,
+                         lapack_idx lds,
+                         lapack_float_complex *A,
+                         lapack_idx lda,
+                         lapack_float_complex *work,
+                         lapack_idx lwork)
+    {
+        Side side_ = char2side(side);
+        Direction direct_ = char2direct(direct);
+        const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+        ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+        ConstMatrix<lapack_float_complex, Layout::ColMajor, lapack_idx> S_(n_rot, k,
+                                                                           S, lds);
+        Matrix<lapack_float_complex, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+            *work = lwork_;
+            return;
+        }
+
+        MemoryBlock<lapack_float_complex, lapack_idx> work_(lwork, work);
+        lasr3(side_, direct_, C_, S_, A_, work_);
     }
 
-    MemoryBlock<lapack_double_complex, lapack_idx> work_(lwork, work);
-    lasr3(side_, direct_, C_, S_, A_, work_);
-}
-
-void cslasr3(char side,
-             char direct,
-             lapack_idx m,
-             lapack_idx n,
-             lapack_idx k,
-             const float* C,
-             lapack_idx ldc,
-             const float* S,
-             lapack_idx lds,
-             lapack_float_complex* A,
-             lapack_idx lda,
-             lapack_float_complex* work,
-             lapack_idx lwork)
-{
-    Side side_ = char2side(side);
-    Direction direct_ = char2direct(direct);
-    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
-
-    ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
-    ConstMatrix<float, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
-    Matrix<lapack_float_complex, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
-
-    if (lwork == -1) {
-        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
-        *work = lwork_;
-        return;
+    void lapack_c_zlasr3(char side,
+                         char direct,
+                         lapack_idx m,
+                         lapack_idx n,
+                         lapack_idx k,
+                         const double *C,
+                         lapack_idx ldc,
+                         const lapack_double_complex *S,
+                         lapack_idx lds,
+                         lapack_double_complex *A,
+                         lapack_idx lda,
+                         lapack_double_complex *work,
+                         lapack_idx lwork)
+    {
+        Side side_ = char2side(side);
+        Direction direct_ = char2direct(direct);
+        const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+        ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+        ConstMatrix<lapack_double_complex, Layout::ColMajor, lapack_idx> S_(
+            n_rot, k, S, lds);
+        Matrix<lapack_double_complex, Layout::ColMajor, lapack_idx> A_(m, n, A,
+                                                                       lda);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+            *work = lwork_;
+            return;
+        }
+
+        MemoryBlock<lapack_double_complex, lapack_idx> work_(lwork, work);
+        lasr3(side_, direct_, C_, S_, A_, work_);
     }
 
-    MemoryBlock<lapack_float_complex, lapack_idx> work_(lwork, work);
-    lasr3(side_, direct_, C_, S_, A_, work_);
-}
-
-void zdlasr3(char side,
-             char direct,
-             lapack_idx m,
-             lapack_idx n,
-             lapack_idx k,
-             const double* C,
-             lapack_idx ldc,
-             const double* S,
-             lapack_idx lds,
-             lapack_double_complex* A,
-             lapack_idx lda,
-             lapack_double_complex* work,
-             lapack_idx lwork)
-{
-    Side side_ = char2side(side);
-    Direction direct_ = char2direct(direct);
-    const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
-
-    ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
-    ConstMatrix<double, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
-    Matrix<lapack_double_complex, Layout::ColMajor, lapack_idx> A_(m, n, A,
-                                                                   lda);
-
-    if (lwork == -1) {
-        lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
-        *work = lwork_;
-        return;
+    void lapack_c_sclasr3(char side,
+                          char direct,
+                          lapack_idx m,
+                          lapack_idx n,
+                          lapack_idx k,
+                          const float *C,
+                          lapack_idx ldc,
+                          const float *S,
+                          lapack_idx lds,
+                          lapack_float_complex *A,
+                          lapack_idx lda,
+                          lapack_float_complex *work,
+                          lapack_idx lwork)
+    {
+        Side side_ = char2side(side);
+        Direction direct_ = char2direct(direct);
+        const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+        ConstMatrix<float, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+        ConstMatrix<float, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+        Matrix<lapack_float_complex, Layout::ColMajor, lapack_idx> A_(m, n, A, lda);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+            *work = lwork_;
+            return;
+        }
+
+        MemoryBlock<lapack_float_complex, lapack_idx> work_(lwork, work);
+        lasr3(side_, direct_, C_, S_, A_, work_);
     }
 
-    MemoryBlock<lapack_double_complex, lapack_idx> work_(lwork, work);
-    lasr3(side_, direct_, C_, S_, A_, work_);
-}
+    void lapack_c_dzlasr3(char side,
+                          char direct,
+                          lapack_idx m,
+                          lapack_idx n,
+                          lapack_idx k,
+                          const double *C,
+                          lapack_idx ldc,
+                          const double *S,
+                          lapack_idx lds,
+                          lapack_double_complex *A,
+                          lapack_idx lda,
+                          lapack_double_complex *work,
+                          lapack_idx lwork)
+    {
+        Side side_ = char2side(side);
+        Direction direct_ = char2direct(direct);
+        const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1;
+
+        ConstMatrix<double, Layout::ColMajor, lapack_idx> C_(n_rot, k, C, ldc);
+        ConstMatrix<double, Layout::ColMajor, lapack_idx> S_(n_rot, k, S, lds);
+        Matrix<lapack_double_complex, Layout::ColMajor, lapack_idx> A_(m, n, A,
+                                                                       lda);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_);
+            *work = lwork_;
+            return;
+        }
+
+        MemoryBlock<lapack_double_complex, lapack_idx> work_(lwork, work);
+        lasr3(side_, direct_, C_, S_, A_, work_);
+    }
 
-}  // extern "C"
\ No newline at end of file
+} // extern "C"
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_fortran/clasr3.f90 b/LAPACK_CPP/src/lapack_fortran/clasr3.f90
new file mode 100644
index 000000000..b6b2957d0
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/clasr3.f90
@@ -0,0 +1,32 @@
+subroutine clasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_clasr3(side_c, direct_c, m_c, n_c, k_c, C_c,&
+             ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_clasr3")
+            use iso_c_binding, only: c_char, c_float, c_float_complex, c_int
+            implicit none
+            integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c
+            character(c_char), intent(in) :: side_c, direct_c
+            real(c_float), intent(in) :: C_c(ldc_c,*)
+            complex(c_float_complex), intent(in) :: S_c(lds_c,*)
+            complex(c_float_complex), intent(inout) :: A_c(lda_c,*), work_c(*)
+        end subroutine lapack_c_clasr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: m,n,k,ldc, lds, lda, lwork
+    character, intent(in) :: side, direct
+    real, intent(in) :: C(ldc,*)
+    complex, intent(in) :: S(lds,*)
+    complex, intent(inout) :: A(lda,*), work(*)
+    character(c_char) :: side_to_c, direct_to_c
+
+    ! Convert characters to C characters
+    side_to_c = side
+    direct_to_c = direct
+
+    ! Call C function
+    call lapack_c_clasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork)
+end subroutine clasr3
diff --git a/LAPACK_CPP/src/lapack_fortran/dlasr3.f90 b/LAPACK_CPP/src/lapack_fortran/dlasr3.f90
new file mode 100644
index 000000000..0a9392d7d
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/dlasr3.f90
@@ -0,0 +1,30 @@
+subroutine dlasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_dlasr3(side_c, direct_c, m_c, n_c, k_c, C_c,&
+             ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_dlasr3")
+            use iso_c_binding, only: c_char, c_double, c_int
+            implicit none
+            integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c
+            character(c_char), intent(in) :: side_c, direct_c
+            real(c_double), intent(in) :: C_c(ldc_c,*),S_c(lds_c,*)
+            real(c_double), intent(inout) :: A_c(lda_c,*), work_c(*)
+        end subroutine lapack_c_dlasr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: m,n,k,ldc, lds, lda, lwork
+    character, intent(in) :: side, direct
+    double precision, intent(in) :: C(ldc,*),S(lds,*)
+    double precision, intent(inout) :: A(lda,*), work(*)
+    character(c_char) :: side_to_c, direct_to_c
+
+    ! Convert characters to C characters
+    side_to_c = side
+    direct_to_c = direct
+
+    ! Call C function
+    call lapack_c_dlasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork)
+end subroutine dlasr3
diff --git a/LAPACK_CPP/src/lapack_fortran/slasr3.f90 b/LAPACK_CPP/src/lapack_fortran/slasr3.f90
new file mode 100644
index 000000000..91ea69c63
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/slasr3.f90
@@ -0,0 +1,30 @@
+subroutine slasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_slasr3(side_c, direct_c, m_c, n_c, k_c, C_c,&
+             ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_slasr3")
+            use iso_c_binding, only: c_char, c_float, c_int
+            implicit none
+            integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c
+            character(c_char), intent(in) :: side_c, direct_c
+            real(c_float), intent(in) :: C_c(ldc_c,*),S_c(lds_c,*)
+            real(c_float), intent(inout) :: A_c(lda_c,*), work_c(*)
+        end subroutine lapack_c_slasr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: m,n,k,ldc, lds, lda, lwork
+    character, intent(in) :: side, direct
+    real, intent(in) :: C(ldc,*),S(lds,*)
+    real, intent(inout) :: A(lda,*), work(*)
+    character(c_char) :: side_to_c, direct_to_c
+
+    ! Convert characters to C characters
+    side_to_c = side
+    direct_to_c = direct
+
+    ! Call C function
+    call lapack_c_slasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork)
+end subroutine slasr3
diff --git a/LAPACK_CPP/src/lapack_fortran/zlasr3.f90 b/LAPACK_CPP/src/lapack_fortran/zlasr3.f90
new file mode 100644
index 000000000..ad0e6c662
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/zlasr3.f90
@@ -0,0 +1,32 @@
+subroutine zlasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_zlasr3(side_c, direct_c, m_c, n_c, k_c, C_c,&
+             ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_zlasr3")
+            use iso_c_binding, only: c_char, c_double, c_double_complex, c_int
+            implicit none
+            integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c
+            character(c_char), intent(in) :: side_c, direct_c
+            real(c_double), intent(in) :: C_c(ldc_c,*)
+            complex(c_double_complex), intent(in) :: S_c(lds_c,*)
+            complex(c_double_complex), intent(inout) :: A_c(lda_c,*), work_c(*)
+        end subroutine lapack_c_zlasr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: m,n,k,ldc, lds, lda, lwork
+    character, intent(in) :: side, direct
+    double precision, intent(in) :: C(ldc,*)
+    double complex, intent(in) :: S(lds,*)
+    double complex, intent(inout) :: A(lda,*), work(*)
+    character(c_char) :: side_to_c, direct_to_c
+
+    ! Convert characters to C characters
+    side_to_c = side
+    direct_to_c = direct
+
+    ! Call C function
+    call lapack_c_zlasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork)
+end subroutine zlasr3

From 2bdd724ecf85630cea3db91adc910f01d3701b02 Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Fri, 1 Mar 2024 18:10:08 +0100
Subject: [PATCH 03/10] small change in template of lasr3

---
 LAPACK_CPP/Makefile                           | 14 ++---
 LAPACK_CPP/example/profile_lasr3.cpp          | 26 ++++++---
 .../include/lapack_cpp/lapack/lasr3.hpp       | 54 +++++++++----------
 3 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile
index cd5adf0bf..ec6b653a4 100644
--- a/LAPACK_CPP/Makefile
+++ b/LAPACK_CPP/Makefile
@@ -7,12 +7,12 @@ FFLAGS=-Wall -Wextra -pedantic -g
 
 all: ./example/test_lasr3 ./example/profile_lasr3
 
-HEADERS = $(wildcard lapack_c/include/*.h) \
-		  $(wildcard lapack_c/include/*/*.h) \
-		  $(wildcard lapack_c/include/*/*/*.h) \
-		  $(wildcard lapack_cpp/include/*.hpp) \
-		  $(wildcard lapack_cpp/include/*/*.hpp) \
-		  $(wildcard lapack_cpp/include/*/*/*.hpp)
+HEADERS = $(wildcard include/lapack_c/*.h) \
+		  $(wildcard include/lapack_c/*/*.h) \
+		  $(wildcard include/lapack_c/*/*/*.h) \
+		  $(wildcard include/lapack_cpp/*.hpp) \
+		  $(wildcard include/lapack_cpp/*/*.hpp) \
+		  $(wildcard include/lapack_cpp/*/*/*.hpp)
 
 OBJFILES = src/lapack_cpp/rot_cpp.o \
 		   src/lapack_cpp/lartg_cpp.o \
@@ -30,7 +30,7 @@ src/lapack_fortran/%.o: src/lapack_fortran/%.f90
 
 # C files
 
-src/lapack_c/%.o: src/lapack_c/%.cpp
+src/lapack_c/%.o: src/lapack_c/%.cpp $(HEADERS)
 	$(CXX) $(CXXFLAGS) -I./include -c -o $@ $<
 
 src/lapack_c/%.o: src/lapack_c/%.f90
diff --git a/LAPACK_CPP/example/profile_lasr3.cpp b/LAPACK_CPP/example/profile_lasr3.cpp
index 456af8df8..2e79a99de 100644
--- a/LAPACK_CPP/example/profile_lasr3.cpp
+++ b/LAPACK_CPP/example/profile_lasr3.cpp
@@ -1,5 +1,5 @@
 
-#include <chrono>  // for high_resolution_clock
+#include <chrono> // for high_resolution_clock
 #include <complex>
 #include <iostream>
 
@@ -28,8 +28,10 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction)
     // Generate random, but valid rotations
     randomize(C);
     randomize(S);
-    for (idx_t i = 0; i < n_rot; ++i) {
-        for (idx_t j = 0; j < k; ++j) {
+    for (idx_t i = 0; i < n_rot; ++i)
+    {
+        for (idx_t j = 0; j < k; ++j)
+        {
             T f = C(i, j);
             T g = S(i, j);
             T r;
@@ -49,7 +51,8 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction)
     const idx_t n_warmup = 20;
     std::vector<float> timings(n_timings);
 
-    for (idx_t i = 0; i < n_timings; ++i) {
+    for (idx_t i = 0; i < n_timings; ++i)
+    {
         A = A_copy;
         auto start = std::chrono::high_resolution_clock::now();
         lasr3(side, direction, C.as_const(), S.as_const(), A, work);
@@ -62,6 +65,11 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction)
         mean += timings[i];
     mean /= n_timings - n_warmup;
 
+    float std_dev = 0;
+    for (idx_t i = n_warmup; i < n_timings; ++i)
+        std_dev += (timings[i] - mean) * (timings[i] - mean);
+    std_dev = std::sqrt(std_dev / (n_timings - n_warmup - 1));
+
     long nflops =
         side == Side::Left ? 6. * (m - 1) * n * k : 6. * (n - 1) * m * k;
 
@@ -69,6 +77,7 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction)
               << ", side = " << (char)side
               << ", direction = " << (char)direction << ", mean time = " << mean
               << " s"
+              << ", std dev = " << std_dev / mean * 100 << " %"
               << ", flop rate = " << nflops / mean * 1.0e-9 << " GFlops"
               << std::endl;
 }
@@ -81,12 +90,15 @@ int main()
     const idx_t nb = 1000;
     const idx_t k = 64;
 
-    for (int s = 0; s < 2; ++s) {
+    for (int s = 0; s < 2; ++s)
+    {
         Side side = s == 0 ? Side::Left : Side::Right;
-        for (int d = 0; d < 2; ++d) {
+        for (int d = 0; d < 2; ++d)
+        {
             Direction direction =
                 d == 0 ? Direction::Forward : Direction::Backward;
-            for (idx_t n_rot = 99; n_rot <= 1600; n_rot += 100) {
+            for (idx_t n_rot = 99; n_rot <= 1600; n_rot += 100)
+            {
                 idx_t m = side == Side::Left ? n_rot + 1 : nb;
                 idx_t n = side == Side::Left ? nb : n_rot + 1;
 
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
index 0a3ffbb8a..535ca4ce6 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
@@ -33,7 +33,7 @@ namespace lapack_cpp {
  *
  * @tparam T     The type of the elements of the matrix A.
  *
- * @tparam TC    The type of the elements of the matrix C.
+ * @tparam real_t<T>    The type of the elements of the matrix C.
  *
  * @tparam TS    The type of the elements of the matrix S.
  *
@@ -41,14 +41,14 @@ namespace lapack_cpp {
  *
  * @tparam idx_t  The type of the indices.
  *
- * @note if either TC or TS is a complex type, then T must also be a complex
+ * @note if either real_t<T> or TS is a complex type, then T must also be a complex
  * type.
  *
  */
-template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+template <typename T, typename TS, Layout layout, typename idx_t>
 void lasr3(Side side,
            Direction direct,
-           ConstMatrix<TC, layout, idx_t> C,
+           ConstMatrix<real_t<T>, layout, idx_t> C,
            ConstMatrix<TS, layout, idx_t> S,
            Matrix<T, layout, idx_t> A,
            MemoryBlock<T, idx_t, true> work);
@@ -56,10 +56,10 @@ void lasr3(Side side,
 /**
  * Workspace query for lasr3.
  */
-template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+template <typename T, typename TS, Layout layout, typename idx_t>
 idx_t lasr3_workquery(Side side,
                       Direction direct,
-                      ConstMatrix<TC, layout, idx_t> C,
+                      ConstMatrix<real_t<T>, layout, idx_t> C,
                       ConstMatrix<TS, layout, idx_t> S,
                       Matrix<T, layout, idx_t> A)
 {
@@ -89,10 +89,10 @@ idx_t lasr3_workquery(Side side,
 /**
  * @copydoc lasr3
  */
-template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+template <typename T, typename TS, Layout layout, typename idx_t>
 void lasr3(Side side,
            Direction direct,
-           ConstMatrix<TC, layout, idx_t> C,
+           ConstMatrix<real_t<T>, layout, idx_t> C,
            ConstMatrix<TS, layout, idx_t> S,
            Matrix<T, layout, idx_t> A)
 {
@@ -117,13 +117,13 @@ namespace internal {
      * @param c2  The cosine of the second rotation.
      * @param s2  The sine of the second rotation.
      */
-    template <typename T, typename TC, typename TS, typename idx_t>
+    template <typename T, typename TS, typename idx_t>
     void rot_fuse2x1(Vector<T, idx_t> x1,
                      Vector<T, idx_t> x2,
                      Vector<T, idx_t> x3,
-                     TC c1,
+                     real_t<T> c1,
                      TS s1,
-                     TC c2,
+                     real_t<T> c2,
                      TS s2)
     {
         assert(x1.size() == x2.size() && x2.size() == x3.size());
@@ -150,13 +150,13 @@ namespace internal {
      * @param c2  The cosine of the second rotation.
      * @param s2  The sine of the second rotation.
      */
-    template <typename T, typename TC, typename TS, typename idx_t>
+    template <typename T, typename TS, typename idx_t>
     void rot_fuse1x2(Vector<T, idx_t> x1,
                      Vector<T, idx_t> x2,
                      Vector<T, idx_t> x3,
-                     TC c1,
+                     real_t<T> c1,
                      TS s1,
-                     TC c2,
+                     real_t<T> c2,
                      TS s2)
     {
         assert(x1.size() == x2.size() && x2.size() == x3.size());
@@ -192,18 +192,18 @@ namespace internal {
      * @param c4  The cosine of the fourth rotation.
      * @param s4  The sine of the fourth rotation.
      */
-    template <typename T, typename TC, typename TS, typename idx_t>
+    template <typename T, typename TS, typename idx_t>
     void rot_fuse2x2(Vector<T, idx_t> x1,
                      Vector<T, idx_t> x2,
                      Vector<T, idx_t> x3,
                      Vector<T, idx_t> x4,
-                     TC c1,
+                     real_t<T> c1,
                      TS s1,
-                     TC c2,
+                     real_t<T> c2,
                      TS s2,
-                     TC c3,
+                     real_t<T> c3,
                      TS s3,
-                     TC c4,
+                     real_t<T> c4,
                      TS s4)
     {
         assert(x1.size() == x2.size() && x1.size() == x3.size() &&
@@ -223,11 +223,10 @@ namespace internal {
 
     // Kernel for lasr3, forward left variant
     template <typename T,
-              typename TC,
               typename TS,
               Layout layout,
               typename idx_t>
-    void lasr3_kernel_forward_left(ConstMatrix<TC, layout, idx_t> C,
+    void lasr3_kernel_forward_left(ConstMatrix<real_t<T>, layout, idx_t> C,
                                    ConstMatrix<TS, layout, idx_t> S,
                                    Matrix<T, layout, idx_t> A,
                                    MemoryBlock<T, idx_t, true> work)
@@ -338,11 +337,10 @@ namespace internal {
 
     // Kernel for lasr3, backward left variant
     template <typename T,
-              typename TC,
               typename TS,
               Layout layout,
               typename idx_t>
-    void lasr3_kernel_backward_left(ConstMatrix<TC, layout, idx_t> C,
+    void lasr3_kernel_backward_left(ConstMatrix<real_t<T>, layout, idx_t> C,
                                     ConstMatrix<TS, layout, idx_t> S,
                                     Matrix<T, layout, idx_t> A,
                                     MemoryBlock<T, idx_t, true> work)
@@ -457,11 +455,10 @@ namespace internal {
 
     // Kernel for lasr3, forward right variant
     template <typename T,
-              typename TC,
               typename TS,
               Layout layout,
               typename idx_t>
-    void lasr3_kernel_forward_right(ConstMatrix<TC, layout, idx_t> C,
+    void lasr3_kernel_forward_right(ConstMatrix<real_t<T>, layout, idx_t> C,
                                     ConstMatrix<TS, layout, idx_t> S,
                                     Matrix<T, layout, idx_t> A,
                                     MemoryBlock<T, idx_t, true> work)
@@ -573,11 +570,10 @@ namespace internal {
 
     // Kernel for lasr3, backward right variant
     template <typename T,
-              typename TC,
               typename TS,
               Layout layout,
               typename idx_t>
-    void lasr3_kernel_backward_right(ConstMatrix<TC, layout, idx_t> C,
+    void lasr3_kernel_backward_right(ConstMatrix<real_t<T>, layout, idx_t> C,
                                      ConstMatrix<TS, layout, idx_t> S,
                                      Matrix<T, layout, idx_t> A,
                                      MemoryBlock<T, idx_t, true> work)
@@ -696,10 +692,10 @@ namespace internal {
 /**
  * @copydoc lasr3
  */
-template <typename T, typename TC, typename TS, Layout layout, typename idx_t>
+template <typename T, typename TS, Layout layout, typename idx_t>
 void lasr3(Side side,
            Direction direct,
-           ConstMatrix<TC, layout, idx_t> C,
+           ConstMatrix<real_t<T>, layout, idx_t> C,
            ConstMatrix<TS, layout, idx_t> S,
            Matrix<T, layout, idx_t> A,
            MemoryBlock<T, idx_t, true> work)

From f57a79230497f03e2b7b0fd58f01a3b21e2040cf Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Fri, 1 Mar 2024 18:10:27 +0100
Subject: [PATCH 04/10] use direct overloading for rot

---
 LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp | 118 ++++++++++++++++----------
 1 file changed, 72 insertions(+), 46 deletions(-)

diff --git a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp
index a50a35607..ce8609ca0 100644
--- a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp
+++ b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp
@@ -9,59 +9,85 @@
 
 namespace lapack_cpp {
 
-/**
- * Templated wrapper around c functions, will be instantiated for each type.
- *
- * @tparam T
- * @param A
- * @param B
- * @param C
- */
-template <typename T, typename TC, typename TS, typename idx_t>
-inline void rot_c_wrapper(const Vector<T, idx_t>& x,
-                          const Vector<T, idx_t>& y,
-                          TC c,
-                          TS s)
+template<>
+void rot(const Vector<float, lapack_idx_t>& x,
+         const Vector<float, lapack_idx_t>& y,
+         float c,
+         float s)
 {
     assert(x.size() == y.size());
+    lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+}
+
+template<>
+void rot(const Vector<double, lapack_idx_t>& x,
+         const Vector<double, lapack_idx_t>& y,
+         double c,
+         double s)
+{
+    assert(x.size() == y.size());
+    lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+}
+
+template<>
+void rot(const Vector<std::complex<float>, lapack_idx_t>& x,
+         const Vector<std::complex<float>, lapack_idx_t>& y,
+         float c,
+         std::complex<float> s)
+{
+    assert(x.size() == y.size());
+    lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+}
+
+template<>
+void rot(const Vector<std::complex<double>, lapack_idx_t>& x,
+         const Vector<std::complex<double>, lapack_idx_t>& y,
+         double c,
+         std::complex<double> s)
+{
+    assert(x.size() == y.size());
+    lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+}
 
-    if constexpr (std::is_same<T, double>::value) {
-        lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
-    }
-    else if constexpr (std::is_same<T, float>::value) {
-        lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
-    }
-    else if constexpr (std::is_same<T, std::complex<float>>::value) {
-        lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
-    }
-    else if constexpr (std::is_same<T, std::complex<double>>::value) {
-        lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
-    }
-    else {
-        assert(false);
-    }
+template<>
+void rot(const Vector<float, int>& x,
+         const Vector<float, int>& y,
+         float c,
+         float s)
+{
+    assert(x.size() == y.size());
+    lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
 }
 
-    // We have a lot of types to instantiate for, so we use a macro to avoid
-    // repetition.
-    #define INSTANTIATE_ROT(T, TC, idx_t)                                    \
-        template <>                                                          \
-        void rot(const Vector<T, idx_t>& x, const Vector<T, idx_t>& y, TC c, \
-                 T s)                                                        \
-        {                                                                    \
-            rot_c_wrapper(x, y, c, s);                                       \
-        }
+template<>
+void rot(const Vector<double, int>& x,
+         const Vector<double, int>& y,
+         double c,
+         double s)
+{
+    assert(x.size() == y.size());
+    lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+}
 
-INSTANTIATE_ROT(float, float, lapack_idx_t)
-INSTANTIATE_ROT(double, double, lapack_idx_t)
-INSTANTIATE_ROT(std::complex<float>, float, lapack_idx_t)
-INSTANTIATE_ROT(std::complex<double>, double, lapack_idx_t)
-INSTANTIATE_ROT(float, float, int)
-INSTANTIATE_ROT(double, double, int)
-INSTANTIATE_ROT(std::complex<float>, float, int)
-INSTANTIATE_ROT(std::complex<double>, double, int)
+template<>
+void rot(const Vector<std::complex<float>, int>& x,
+         const Vector<std::complex<float>, int>& y,
+         float c,
+         std::complex<float> s)
+{
+    assert(x.size() == y.size());
+    lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+}
 
-    #undef INSTANTIATE_ROT
+template<>
+void rot(const Vector<std::complex<double>, int>& x,
+         const Vector<std::complex<double>, int>& y,
+         double c,
+         std::complex<double> s)
+{
+    assert(x.size() == y.size());
+    lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s);
+}
 
 }  // namespace lapack_cpp
 #endif  // USE_FORTRAN_BLAS
\ No newline at end of file

From 81be118bcb799b921c7dc4270e184ef4d321bf20 Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Mon, 4 Mar 2024 09:41:25 +0100
Subject: [PATCH 05/10] change filenames

---
 LAPACK_CPP/Makefile                                    | 10 +++++-----
 LAPACK_CPP/include/lapack_cpp/base/enums.hpp           |  5 +++++
 LAPACK_CPP/src/lapack_c/{lartg_c.f90 => lartg.f90}     |  0
 LAPACK_CPP/src/lapack_c/{lasr3_c.cpp => lasr3.cpp}     |  0
 LAPACK_CPP/src/lapack_c/{rot_c.f90 => rot.f90}         |  0
 LAPACK_CPP/src/lapack_cpp/{lartg_cpp.cpp => lartg.cpp} |  0
 LAPACK_CPP/src/lapack_cpp/{rot_cpp.cpp => rot.cpp}     |  0
 7 files changed, 10 insertions(+), 5 deletions(-)
 rename LAPACK_CPP/src/lapack_c/{lartg_c.f90 => lartg.f90} (100%)
 rename LAPACK_CPP/src/lapack_c/{lasr3_c.cpp => lasr3.cpp} (100%)
 rename LAPACK_CPP/src/lapack_c/{rot_c.f90 => rot.f90} (100%)
 rename LAPACK_CPP/src/lapack_cpp/{lartg_cpp.cpp => lartg.cpp} (100%)
 rename LAPACK_CPP/src/lapack_cpp/{rot_cpp.cpp => rot.cpp} (100%)

diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile
index ec6b653a4..46939582e 100644
--- a/LAPACK_CPP/Makefile
+++ b/LAPACK_CPP/Makefile
@@ -14,11 +14,11 @@ HEADERS = $(wildcard include/lapack_c/*.h) \
 		  $(wildcard include/lapack_cpp/*/*.hpp) \
 		  $(wildcard include/lapack_cpp/*/*/*.hpp)
 
-OBJFILES = src/lapack_cpp/rot_cpp.o \
-		   src/lapack_cpp/lartg_cpp.o \
-		   src/lapack_c/rot_c.o \
-		   src/lapack_c/lartg_c.o \
-		   src/lapack_c/lasr3_c.o \
+OBJFILES = src/lapack_cpp/rot.o \
+		   src/lapack_cpp/lartg.o \
+		   src/lapack_c/rot.o \
+		   src/lapack_c/lartg.o \
+		   src/lapack_c/lasr3.o \
 		   src/lapack_fortran/slasr3.o \
 		   src/lapack_fortran/dlasr3.o \
 		   src/lapack_fortran/clasr3.o \
diff --git a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
index d3f929aca..6fd09bb0e 100644
--- a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
@@ -20,6 +20,7 @@ inline constexpr Op char2op(char t)
             return Op::ConjTrans;
         default:
             assert(false);
+            return Op::NoTrans;
     }
 }
 
@@ -34,6 +35,7 @@ inline constexpr Side char2side(char t)
             return Side::Right;
         default:
             assert(false);
+            return Side::Left;
     }
 }
 
@@ -48,6 +50,7 @@ inline constexpr Uplo char2uplo(char t)
             return Uplo::Lower;
         default:
             assert(false);
+            return Uplo::Upper;
     }
 }
 
@@ -62,6 +65,7 @@ inline constexpr Diag char2diag(char t)
             return Diag::NonUnit;
         default:
             assert(false);
+            return Diag::NonUnit;
     }
 }
 
@@ -76,6 +80,7 @@ inline constexpr Direction char2direct(char t)
             return Direction::Backward;
         default:
             assert(false);
+            return Direction::Forward;
     }
 }
 
diff --git a/LAPACK_CPP/src/lapack_c/lartg_c.f90 b/LAPACK_CPP/src/lapack_c/lartg.f90
similarity index 100%
rename from LAPACK_CPP/src/lapack_c/lartg_c.f90
rename to LAPACK_CPP/src/lapack_c/lartg.f90
diff --git a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp b/LAPACK_CPP/src/lapack_c/lasr3.cpp
similarity index 100%
rename from LAPACK_CPP/src/lapack_c/lasr3_c.cpp
rename to LAPACK_CPP/src/lapack_c/lasr3.cpp
diff --git a/LAPACK_CPP/src/lapack_c/rot_c.f90 b/LAPACK_CPP/src/lapack_c/rot.f90
similarity index 100%
rename from LAPACK_CPP/src/lapack_c/rot_c.f90
rename to LAPACK_CPP/src/lapack_c/rot.f90
diff --git a/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/lartg.cpp
similarity index 100%
rename from LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp
rename to LAPACK_CPP/src/lapack_cpp/lartg.cpp
diff --git a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/rot.cpp
similarity index 100%
rename from LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp
rename to LAPACK_CPP/src/lapack_cpp/rot.cpp

From 0d9803a29c157961e29f2ddf530ba008a79502d7 Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Wed, 6 Mar 2024 11:58:41 +0100
Subject: [PATCH 06/10] add c wrapper to cpp rot

---
 LAPACK_CPP/Makefile             | 25 ++++++-------
 LAPACK_CPP/src/lapack_c/rot.cpp | 65 +++++++++++++++++++++++++++++++++
 LAPACK_CPP/src/lapack_c/rot.f90 |  4 +-
 3 files changed, 79 insertions(+), 15 deletions(-)
 create mode 100644 LAPACK_CPP/src/lapack_c/rot.cpp

diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile
index 46939582e..3badab025 100644
--- a/LAPACK_CPP/Makefile
+++ b/LAPACK_CPP/Makefile
@@ -14,30 +14,27 @@ HEADERS = $(wildcard include/lapack_c/*.h) \
 		  $(wildcard include/lapack_cpp/*/*.hpp) \
 		  $(wildcard include/lapack_cpp/*/*/*.hpp)
 
-OBJFILES = src/lapack_cpp/rot.o \
-		   src/lapack_cpp/lartg.o \
-		   src/lapack_c/rot.o \
-		   src/lapack_c/lartg.o \
-		   src/lapack_c/lasr3.o \
-		   src/lapack_fortran/slasr3.o \
-		   src/lapack_fortran/dlasr3.o \
-		   src/lapack_fortran/clasr3.o \
-		   src/lapack_fortran/zlasr3.o \
+SRCFILES = $(wildcard src/lapack_c/*.cpp) \
+		   $(wildcard src/lapack_c/*.f90) \
+		   $(wildcard src/lapack_cpp/*.cpp) \
+		   $(wildcard src/lapack_fortran/*.f90)
+
+OBJFILES = $(patsubst %.f90, %_f90.o ,$(patsubst %.cpp,%_cpp.o,$(SRCFILES)))
 
 # Fortran files
-src/lapack_fortran/%.o: src/lapack_fortran/%.f90
+src/lapack_fortran/%_f90.o: src/lapack_fortran/%.f90
 	$(FC) $(FFLAGS) -cpp -c -o $@ $<
 
 # C files
 
-src/lapack_c/%.o: src/lapack_c/%.cpp $(HEADERS)
+src/lapack_c/%_cpp.o: src/lapack_c/%.cpp $(HEADERS)
 	$(CXX) $(CXXFLAGS) -I./include -c -o $@ $<
 
-src/lapack_c/%.o: src/lapack_c/%.f90
-	$(FC) $(FFLAGS) -c -o $@ $<
+src/lapack_c/%_f90.o: src/lapack_c/%.f90
+	$(FC) $(FFLAGS) -cpp -c -o $@ $<
 
 # C++ files
-src/lapack_cpp/%.o: src/lapack_cpp/%.cpp $(HEADERS)
+src/lapack_cpp/%_cpp.o: src/lapack_cpp/%.cpp $(HEADERS)
 	$(CXX) $(CXXFLAGS) -I./include -c -o $@ $<
 
 # Test files
diff --git a/LAPACK_CPP/src/lapack_c/rot.cpp b/LAPACK_CPP/src/lapack_c/rot.cpp
new file mode 100644
index 000000000..f982276a2
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/rot.cpp
@@ -0,0 +1,65 @@
+#ifndef USE_FORTRAN_BLAS
+#include "lapack_c/rot.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/blas/rot.hpp"
+
+using namespace lapack_cpp;
+
+// Actual definitions
+extern "C"
+{
+
+    void lapack_c_drot(lapack_idx n,
+                       double *x,
+                       lapack_idx incx,
+                       double *y,
+                       lapack_idx incy,
+                       double c,
+                       double s)
+    {
+        Vector<double, lapack_idx> x_(n, x, incx);
+        Vector<double, lapack_idx> y_(n, y, incy);
+        rot(x_, y_, c, s);
+    }
+
+    void lapack_c_srot(lapack_idx n,
+                       float *x,
+                       lapack_idx incx,
+                       float *y,
+                       lapack_idx incy,
+                       float c,
+                       float s)
+    {
+        Vector<float, lapack_idx> x_(n, x, incx);
+        Vector<float, lapack_idx> y_(n, y, incy);
+        rot(x_, y_, c, s);
+    }
+
+    void lapack_c_crot(lapack_idx n,
+                       lapack_float_complex *x,
+                       lapack_idx incx,
+                       lapack_float_complex *y,
+                       lapack_idx incy,
+                       float c,
+                       lapack_float_complex s)
+    {
+        Vector<lapack_float_complex, lapack_idx> x_(n, x, incx);
+        Vector<lapack_float_complex, lapack_idx> y_(n, y, incy);
+        rot(x_, y_, c, s);
+    }
+
+    void lapack_c_zrot(lapack_idx n,
+                       lapack_double_complex *x,
+                       lapack_idx incx,
+                       lapack_double_complex *y,
+                       lapack_idx incy,
+                       double c,
+                       lapack_double_complex s)
+    {
+        Vector<lapack_double_complex, lapack_idx> x_(n, x, incx);
+        Vector<lapack_double_complex, lapack_idx> y_(n, y, incy);
+        rot(x_, y_, c, s);
+    }
+
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/rot.f90 b/LAPACK_CPP/src/lapack_c/rot.f90
index 6876c9036..60df52f41 100644
--- a/LAPACK_CPP/src/lapack_c/rot.f90
+++ b/LAPACK_CPP/src/lapack_c/rot.f90
@@ -1,3 +1,4 @@
+#ifdef USE_FORTRAN_BLAS
 subroutine lapack_c_drot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_drot')
     use iso_c_binding
     implicit none
@@ -54,4 +55,5 @@ subroutine lapack_c_zrot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_zro
     ! Forward call to fortran implementation
     call zrot( n, dx ,incx, dy, incy, c, s )
 
-end subroutine lapack_c_zrot
\ No newline at end of file
+end subroutine lapack_c_zrot
+#endif
\ No newline at end of file

From a4c4a4eb5261f37149a8b5216fc36a4b47298e68 Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Wed, 6 Mar 2024 14:17:19 +0100
Subject: [PATCH 07/10] add steqr3

---
 .gitignore                                    |   3 +
 LAPACK_CPP/include/lapack_c.h                 |   4 +-
 LAPACK_CPP/include/lapack_c/laev2.h           |  19 +
 LAPACK_CPP/include/lapack_c/lasrt.h           |  19 +
 LAPACK_CPP/include/lapack_cpp/base/enums.hpp  |  15 +
 .../include/lapack_cpp/lapack/laev2.hpp       |  55 +++
 .../include/lapack_cpp/lapack/lasr3.hpp       |   4 +-
 .../include/lapack_cpp/lapack/lasrt.hpp       |  17 +
 .../include/lapack_cpp/lapack/steqr3.hpp      | 428 ++++++++++++++++++
 LAPACK_CPP/src/lapack_c/laev2.f90             |  25 +
 LAPACK_CPP/src/lapack_c/lasrt.f90             |  37 ++
 LAPACK_CPP/src/lapack_cpp/laev2.cpp           |  23 +
 LAPACK_CPP/src/lapack_cpp/lasrt.cpp           |  51 +++
 13 files changed, 697 insertions(+), 3 deletions(-)
 create mode 100644 LAPACK_CPP/include/lapack_c/laev2.h
 create mode 100644 LAPACK_CPP/include/lapack_c/lasrt.h
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
 create mode 100644 LAPACK_CPP/src/lapack_c/laev2.f90
 create mode 100644 LAPACK_CPP/src/lapack_c/lasrt.f90
 create mode 100644 LAPACK_CPP/src/lapack_cpp/laev2.cpp
 create mode 100644 LAPACK_CPP/src/lapack_cpp/lasrt.cpp

diff --git a/.gitignore b/.gitignore
index 568e67b33..361dc7bd8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,6 @@ build*
 DOCS/man
 DOCS/explore-html
 output_err
+
+# Editor config files
+.vscode/
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h
index 4212a73f4..f6ad2c42e 100644
--- a/LAPACK_CPP/include/lapack_c.h
+++ b/LAPACK_CPP/include/lapack_c.h
@@ -8,6 +8,8 @@
 
 // LAPACK
 #include "lapack_c/lartg.h"
-
+#include "lapack_c/lasrt.h"
+#include "lapack_c/laev2.h"
+#include "lapack_c/lasrt.h"
 
 #endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/laev2.h b/LAPACK_CPP/include/lapack_c/laev2.h
new file mode 100644
index 000000000..2cb5a6ad4
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/laev2.h
@@ -0,0 +1,19 @@
+#ifndef LAPACK_C_LAEV2_HPP
+#define LAPACK_C_LAEV2_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+    void lapack_c_slaev2(float a, float b, float c, float *rt1, float *rt2, float *cs1, float *sn1);
+
+    void lapack_c_dlaev2(double a, double b, double c, double *rt1, double *rt2, double *cs1, double *sn1);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // LAPACK_C_LAEV2_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/lasrt.h b/LAPACK_CPP/include/lapack_c/lasrt.h
new file mode 100644
index 000000000..6cf8a60d6
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/lasrt.h
@@ -0,0 +1,19 @@
+#ifndef LAPACK_C_LASRT_HPP
+#define LAPACK_C_LASRT_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+    void lapack_c_slasrt(char id, lapack_idx n, float* d, lapack_idx *info);
+
+    void lapack_c_dlasrt(char id, lapack_idx n, double* d, lapack_idx *info);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // LAPACK_C_LASRT_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
index 6fd09bb0e..6bd6c5365 100644
--- a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
@@ -84,6 +84,21 @@ inline constexpr Direction char2direct(char t)
     }
 }
 
+enum class IncDec : char { Increasing = 'I', Decreasing = 'D' };
+
+inline constexpr IncDec char2incdec(char t)
+{
+    switch (t) {
+        case 'I':
+            return IncDec::Increasing;
+        case 'D':
+            return IncDec::Decreasing;
+        default:
+            assert(false);
+            return IncDec::Increasing;
+    }
+}
+
 }  // namespace lapack_cpp
 
 #endif  // LAPACK_CPP_ENUMP_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp
new file mode 100644
index 000000000..72a105f8c
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp
@@ -0,0 +1,55 @@
+#ifndef LAPACK_CPP_LAEV2_HPP
+#define LAPACK_CPP_LAEV2_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/utils.hpp"
+
+namespace lapack_cpp {
+
+/** Computes the eigenvalues and eigenvector of a real symmetric 2x2 matrix A
+ *  [ a b ]
+ *  [ b c ]
+ *  On exit, the decomposition satisfies:
+ *  [ cs  sn ] [ a b ] [ cs -sn ] = [ s1  0  ]
+ *  [ -sn cs ] [ b c ] [ sn  cs ]   [ 0   s2 ]
+ *  where cs*cs + sn*sn = 1.
+ *
+ * @param[in] a
+ *      Element (0,0) of A.
+ * @param[in] b
+ *      Element (0,1) and (1,0) of A.
+ * @param[in] c
+ *      Element (1,1) of A.
+ * @param[out] s1
+ *      The eigenvalue of A with the largest absolute value.
+ * @param[out] s2
+ *      The eigenvalue of A with the smallest absolute value.
+ * @param[out] cs
+ *      The cosine of the rotation matrix.
+ * @param[out] sn
+ *      The sine of the rotation matrix.
+ *
+ * \verbatim
+ *  s1 is accurate to a few ulps barring over/underflow.
+ *
+ *  s2 may be inaccurate if there is massive cancellation in the
+ *  determinant a*c-b*b; higher precision or correctly rounded or
+ *  correctly truncated arithmetic would be needed to compute s2
+ *  accurately in all cases.
+ *
+ *  cs and sn are accurate to a few ulps barring over/underflow.
+ *
+ *  Overflow is possible only if s1 is within a factor of 5 of overflow.
+ *  Underflow is harmless if the input data is 0 or exceeds
+ *     underflow_threshold / macheps.
+ * \endverbatim
+ *
+ *
+ * @ingroup auxiliary
+ */
+template <typename T>
+void laev2(T a, T b, T c, T& s1, T& s2, T& cs, T& sn);
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_LAEV2_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
index 535ca4ce6..a1ecfa66f 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
@@ -51,7 +51,7 @@ void lasr3(Side side,
            ConstMatrix<real_t<T>, layout, idx_t> C,
            ConstMatrix<TS, layout, idx_t> S,
            Matrix<T, layout, idx_t> A,
-           MemoryBlock<T, idx_t, true> work);
+           MemoryBlock<T, idx_t, true>& work);
 
 /**
  * Workspace query for lasr3.
@@ -698,7 +698,7 @@ void lasr3(Side side,
            ConstMatrix<real_t<T>, layout, idx_t> C,
            ConstMatrix<TS, layout, idx_t> S,
            Matrix<T, layout, idx_t> A,
-           MemoryBlock<T, idx_t, true> work)
+           MemoryBlock<T, idx_t, true>& work)
 {
     const idx_t m = A.num_rows();
     const idx_t n = A.num_columns();
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp
new file mode 100644
index 000000000..d81fb99bb
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp
@@ -0,0 +1,17 @@
+#ifndef LAPACK_CPP_LASRT_HPP
+#define LAPACK_CPP_LASRT_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/utils.hpp"
+
+namespace lapack_cpp {
+
+/**
+ * Sort the numbers in an array.
+ */
+template <typename T, typename idx_t>
+void lasrt(IncDec id, Vector<T, idx_t> d);
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_LASRT_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
new file mode 100644
index 000000000..d6420dd6a
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
@@ -0,0 +1,428 @@
+#ifndef LAPACK_CPP_STEQR3_HPP
+#define LAPACK_CPP_STEQR3_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/lasr3.hpp"
+#include "lapack_cpp/lapack/lartg.hpp"
+#include "lapack_cpp/lapack/lasrt.hpp"
+#include "lapack_cpp/lapack/laev2.hpp"
+
+namespace lapack_cpp
+{
+
+    /**
+     * STEQR3 computes all eigenvalues and, optionally, eigenvectors of a
+     * hermitian tridiagonal matrix using the implicit QL or QR method.
+     * The eigenvectors of a full or band hermitian matrix can also be found
+     * if SYTRD or SPTRD or SBTRD has been used to reduce this matrix to
+     * tridiagonal form.
+     *
+     * STEQR3 is a variant of STEQR that uses lasr3 to efficiently
+     * accumulate the rotations into the eigenvector matrix.
+     *
+     * @return  0 if success
+     *
+     * @param[in] want_z bool
+     *
+     * @param[in,out] d Real vector of length n.
+     *      On entry, diagonal elements of the bidiagonal matrix B.
+     *      On exit, the singular values of B in decreasing order.
+     *
+     * @param[in,out] e Real vector of length n-1.
+     *      On entry, off-diagonal elements of the bidiagonal matrix B.
+     *      On exit, the singular values of B in decreasing order.
+     *
+     * @param[in,out] Z n-by-m matrix.
+     *      On entry, the n-by-n unitary matrix used in the reduction
+     *      to tridiagonal form.
+     *      On exit, if info = 0, and want_z=true then Z contains the
+     *      orthonormal eigenvectors of the original Hermitian matrix.
+     *
+     * @ingroup computational
+     */
+    template <typename T,
+              Layout layout,
+              typename idx_t,
+              bool aligned>
+    int steqr3(
+        bool want_z,
+        Vector<real_t<T>, idx_t> d,
+        Vector<real_t<T>, idx_t> e,
+        Matrix<T, layout, idx_t> Z,
+        MemoryBlock<real_t<T>, idx_t, aligned> &rwork,
+        MemoryBlock<T, idx_t, aligned> &work)
+    {
+        using T_real = real_t<T>;
+
+        // constants
+        const T_real two(2);
+        const T_real one(1);
+        const T_real zero(0);
+        const idx_t n = size(d);
+
+        // Amount of rotation sequences to generate before applying it.
+        const auto nb = idx_t{32};
+
+        // Quick return if possible
+        if (n == 0)
+            return 0;
+        if (n == 1)
+        {
+            if (want_z)
+                Z(0, 0) = one;
+            return 0;
+        }
+
+        // Determine the unit roundoff and over/underflow thresholds.
+        const T_real eps = ulp<T_real>();
+        const T_real eps2 = eps * eps;
+        const T_real safmin = safe_min<T_real>();
+
+        // Compute the eigenvalues and eigenvectors of the tridiagonal
+        // matrix.
+        const idx_t itmax = 30 * n;
+
+        // istart and istop determine the active block
+        idx_t istart = 0;
+        idx_t istop = n;
+
+        // Keep track of previous istart and istop to know when to change direction
+        idx_t istart_old = -1;
+        idx_t istop_old = -1;
+
+        // Keep track of the delayed rotation sequences
+        idx_t i_block = 0;
+
+        // C and S are used to store the rotation sequences
+        Matrix<T_real, layout, idx_t> C(n - 1, nb, rwork);
+        auto rwork2 = rwork.remainder(n - 1, nb);
+        Matrix<T_real, layout, idx_t> S(n - 1, nb, rwork2);
+
+        // Initialize C and S to identity rotations
+        for (idx_t j = 0; j < nb; ++j)
+        {
+            for (idx_t i = 0; i < n - 1; ++i)
+            {
+                C(i, j) = one;
+                S(i, j) = zero;
+            }
+        }
+
+        // Direction to chase the bulge
+        // This variable is reevaluated for every new subblock
+        Direction direction = d[0] > d[n - 1] ? Direction::Forward : Direction::Backward;
+
+        // Main loop
+        for (idx_t iter = 0; iter < itmax; iter++)
+        {
+            // If we have saved up enough rotations, apply them
+            if (want_z and (i_block >= nb or iter == itmax or istop <= 1))
+            {
+                idx_t i_block2 = min<idx_t>(i_block + 1, nb);
+
+                // Find smallest index where rotation is not identity
+                idx_t i_start_block = n - 1;
+                for (idx_t i = 0; i < i_block2; ++i)
+                {
+                    for (idx_t j = 0; j < i_start_block; ++j)
+                        if (C(j, i) != one or S(j, i) != zero)
+                        {
+                            i_start_block = j;
+                            break;
+                        }
+                }
+
+                // Find largest index where rotation is not identity
+                idx_t i_stop_block = 0;
+                for (idx_t i = 0; i < i_block2; ++i)
+                {
+                    for (idx_t j2 = n - 1; j2 > i_stop_block; --j2)
+                    {
+                        idx_t j = j2 - 1;
+                        if (C(j, i) != one or S(j, i) != zero)
+                        {
+                            i_stop_block = j;
+                            break;
+                        }
+                    }
+                }
+
+                if (i_start_block < i_stop_block + 1)
+                {
+                    auto C2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
+                    auto S2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
+                    auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2);
+                    lasr3(Side::Right, direction, C2, S2, Z2, work);
+                }
+                // Reset block
+                i_block = 0;
+
+                // Initialize C and S to identity rotations
+                for (idx_t j = 0; j < nb; ++j)
+                {
+                    for (idx_t i = 0; i < n - 1; ++i)
+                    {
+                        C(i, j) = one;
+                        S(i, j) = zero;
+                    }
+                }
+            }
+            if (iter == itmax)
+            {
+                // The QR algorithm failed to converge, return with error.
+                return istop;
+            }
+
+            if (istop <= 1)
+            {
+                // All eigenvalues have been found, exit and return 0.
+                break;
+            }
+
+            // Find active block
+            for (idx_t i = istop - 1; i > istart; --i)
+            {
+                if ((e[i - 1] * e[i - 1]) <=
+                    (eps2 * abs(d[i - 1])) * abs(d[i]) + safmin)
+                {
+                    e[i - 1] = zero;
+                    istart = i;
+                    break;
+                }
+            }
+
+            // An eigenvalue has split off, reduce istop and start the loop again
+            if (istart == istop - 1)
+            {
+                istop = istop - 1;
+                istart = 0;
+                continue;
+            }
+
+            // A 2x2 block has split off, handle separately
+            if (istart + 1 == istop - 1)
+            {
+                T_real s1, s2;
+                if (want_z)
+                {
+                    T_real cs, sn;
+                    laev2(d[istart], e[istart], d[istart + 1], s1, s2, cs, sn);
+
+                    // Store rotation
+                    C(istart, i_block) = cs;
+                    S(istart, i_block) = sn;
+                    // Normally, we would increment i_block here, but we do not
+                    // because the block has been deflated and should not interfere
+                    // with the next sequence.
+                }
+                else
+                {
+                    lae2(d[istart], e[istart], d[istart + 1], s1, s2);
+                }
+                d[istart] = s1;
+                d[istart + 1] = s2;
+                e[istart] = zero;
+
+                istop = istop - 2;
+                istart = 0;
+                continue;
+            }
+
+            // Choose between QL and QR iteration
+            if (istart >= istop_old or istop <= istart_old)
+            {
+                Direction direction_new;
+                // We prefer to keep the current direction, because switching
+                // direction forces us to apply the rotations in the block, which
+                // may be inefficient if the block is small.
+                if (direction == Direction::Forward)
+                {
+                    direction_new = 10. * abs(d[istart]) > abs(d[istop - 1]) ? Direction::Forward : Direction::Backward;
+                }
+                else
+                {
+                    direction_new =
+                        abs(d[istart]) > 10. * abs(d[istop - 1]) ? Direction::Forward : Direction::Backward;
+                }
+
+                if (direction_new != direction)
+                {
+                    // We don't want different directions in our saved rotation matrices
+                    // So we apply all the current rotations and reset the block
+                    idx_t i_block2 = min<idx_t>(i_block + 1, nb);
+
+                    // Find smallest index where rotation is not identity
+                    idx_t i_start_block = n - 1;
+                    for (idx_t i = 0; i < i_block2; ++i)
+                    {
+                        for (idx_t j = 0; j < i_start_block; ++j)
+                            if (C(j, i) != one or S(j, i) != zero)
+                            {
+                                i_start_block = j;
+                                break;
+                            }
+                    }
+
+                    // Find largest index where rotation is not identity
+                    idx_t i_stop_block = 0;
+                    for (idx_t i = 0; i < i_block2; ++i)
+                    {
+                        for (idx_t j2 = n - 1; j2 > i_stop_block; --j2)
+                        {
+                            idx_t j = j2 - 1;
+                            if (C(j, i) != one or S(j, i) != zero)
+                            {
+                                i_stop_block = j;
+                                break;
+                            }
+                        }
+                    }
+
+                    if (i_start_block < i_stop_block + 1)
+                    {
+                        auto C2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
+                        auto S2 = S.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
+                        auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2);
+
+                        lasr3(Side::Right, direction, C2, S2, Z2, work);
+                    }
+                    // Reset block
+                    i_block = 0;
+
+                    // Initialize C and S to identity rotations
+                    for (idx_t j = 0; j < nb; ++j)
+                    {
+                        for (idx_t i = 0; i < n - 1; ++i)
+                        {
+                            C(i, j) = one;
+                            S(i, j) = zero;
+                        }
+                    }
+                }
+                direction = direction_new;
+            }
+            istart_old = istart;
+            istop_old = istop;
+
+            if (direction == Direction::Forward)
+            {
+                // QR iteration
+
+                // Form shift using last 2x2 block of the active matrix
+                T_real p = d[istop - 1];
+                T_real g = (d[istop - 2] - p) / (two * e[istop - 2]);
+                T_real r = lapy2(g, one);
+                g = d[istart] - p + e[istop - 2] / (T_real)(g + (sgn(g) * r));
+
+                T_real s = one;
+                T_real c = one;
+                p = zero;
+
+                // Chase bulge from top to bottom
+                for (idx_t i = istart; i < istop - 1; ++i)
+                {
+                    T_real f = s * e[i];
+                    T_real b = c * e[i];
+                    lartg(g, f, c, s, r);
+                    if (i != istart)
+                        e[i - 1] = r;
+                    g = d[i] - p;
+                    r = (d[i + 1] - g) * s + two * c * b;
+                    p = s * r;
+                    d[i] = g + p;
+                    g = c * r - b;
+                    // If eigenvalues are desired, then apply rotations
+                    if (want_z)
+                    {
+                        // Store rotation for later
+                        C(i, i_block) = c;
+                        S(i, i_block) = s;
+                    }
+                }
+                d[istop - 1] = d[istop - 1] - p;
+                e[istop - 2] = g;
+            }
+            else
+            {
+                // QL iteration
+
+                // Form shift using last 2x2 block of the active matrix
+                T_real p = d[istart];
+                T_real g = (d[istart + 1] - p) / (two * e[istart]);
+                T_real r = lapy2(g, one);
+                g = d[istop - 1] - p + e[istart] / (T_real)(g + (sgn(g) * r));
+
+                T_real s = one;
+                T_real c = one;
+                p = zero;
+
+                // Chase bulge from bottom to top
+                for (idx_t i = istop - 1; i > istart; --i)
+                {
+                    T_real f = s * e[i - 1];
+                    T_real b = c * e[i - 1];
+                    lartg(g, f, c, s, r);
+                    if (i != istop - 1)
+                        e[i] = r;
+                    g = d[i] - p;
+                    r = (d[i - 1] - g) * s + two * c * b;
+                    p = s * r;
+                    d[i] = g + p;
+                    g = c * r - b;
+                    // If eigenvalues are desired, then apply rotations
+                    if (want_z)
+                    {
+                        // Store rotation for later
+                        C(i - 1, i_block) = c;
+                        S(i - 1, i_block) = -s;
+                    }
+                }
+                d[istart] = d[istart] - p;
+                e[istart] = g;
+            }
+            i_block++;
+        }
+
+        // Order eigenvalues and eigenvectors
+        if (!want_z)
+        {
+            // Order eigenvalues
+            lasrt(IncDec::Increasing, d);
+        }
+        else
+        {
+            // Use selection sort to minize swaps of eigenvectors
+            for (idx_t i = 0; i < n - 1; ++i)
+            {
+                idx_t k = i;
+                T_real p = d[i];
+                for (idx_t j = i + 1; j < n; ++j)
+                {
+                    if (d[j] < p)
+                    {
+                        k = j;
+                        p = d[j];
+                    }
+                }
+                if (k != i)
+                {
+                    d[k] = d[i];
+                    d[i] = p;
+                    auto z1 = col(Z, i);
+                    auto z2 = col(Z, k);
+                    for (idx_t j = 0; j < n; ++j)
+                    {
+                        auto temp = z1[j];
+                        z1[j] = z2[j];
+                        z2[j] = temp;
+                    }
+                }
+            }
+        }
+
+        return 0;
+    }
+
+}
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/laev2.f90 b/LAPACK_CPP/src/lapack_c/laev2.f90
new file mode 100644
index 000000000..0fc787ff3
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/laev2.f90
@@ -0,0 +1,25 @@
+subroutine lapack_c_slaev2(a, b, c, rt1, rt2, cs1, sn1) bind(c, name='lapack_c_slaev2')
+    use iso_c_binding, only: c_float
+    implicit none
+    real(kind=c_float), intent(in), value :: a, b, c
+    real(kind=c_float), intent(out) :: rt1, rt2, cs1, sn1
+
+    external slaev2
+
+    ! Forward call to fortran implementation
+    call slaev2( a, b, c, rt1, rt2, cs1, sn1)
+
+end subroutine lapack_c_slaev2
+
+subroutine lapack_c_dlaev2(a, b, c, rt1, rt2, cs1, sn1) bind(c, name='lapack_c_dlaev2')
+    use iso_c_binding, only: c_double
+    implicit none
+    real(kind=c_double), intent(in), value :: a, b, c
+    real(kind=c_double), intent(out) :: rt1, rt2, cs1, sn1
+
+    external dlaev2
+
+    ! Forward call to fortran implementation
+    call dlaev2( a, b, c, rt1, rt2, cs1, sn1)
+
+end subroutine lapack_c_dlaev2
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/lasrt.f90 b/LAPACK_CPP/src/lapack_c/lasrt.f90
new file mode 100644
index 000000000..560a2990b
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/lasrt.f90
@@ -0,0 +1,37 @@
+subroutine lapack_c_slasrt(id, n, d, info) bind(c, name='lapack_c_slasrt')
+    use iso_c_binding, only: c_char, c_int, c_float
+    implicit none
+    character(kind=c_char), intent(in), value :: id
+    integer(kind=c_int), intent(in), value :: n
+    real(kind=c_float), intent(inout) :: d(*)
+    integer(kind=c_int), intent(out) :: info
+
+    character :: id_fortran
+
+    external slasrt
+
+    ! Convert the character to fortran
+    id_fortran = id
+    ! Forward call to fortran implementation
+    call slasrt( id_fortran, n, d, info)
+
+end subroutine lapack_c_slasrt
+
+subroutine lapack_c_dlasrt(id, n, d, info) bind(c, name='lapack_c_dlasrt')
+    use iso_c_binding, only: c_char, c_int, c_double
+    implicit none
+    character(kind=c_char), intent(in), value :: id
+    integer(kind=c_int), intent(in), value :: n
+    real(kind=c_double), intent(inout) :: d(*)
+    integer(kind=c_int), intent(out) :: info
+
+    character :: id_fortran
+
+    external dlasrt
+
+    ! Convert the character to fortran
+    id_fortran = id
+    ! Forward call to fortran implementation
+    call dlasrt( id_fortran, n, d, info)
+
+end subroutine lapack_c_dlasrt
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_cpp/laev2.cpp b/LAPACK_CPP/src/lapack_cpp/laev2.cpp
new file mode 100644
index 000000000..2b74e53bc
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_cpp/laev2.cpp
@@ -0,0 +1,23 @@
+#include <cassert>
+#include <complex>
+#include <type_traits>
+
+#include "lapack_c.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/laev2.hpp"
+
+namespace lapack_cpp {
+
+template <>
+void laev2<float>(float a, float b, float c, float& s1, float& s2, float& cs, float& sn)
+{
+    lapack_c_slaev2(a, b, c, &s1, &s2, &cs, &sn);
+}
+
+template <>
+void laev2<double>(double a, double b, double c, double& s1, double& s2, double& cs, double& sn)
+{
+    lapack_c_dlaev2(a, b, c, &s1, &s2, &cs, &sn);
+}
+
+}  // namespace lapack_cpp
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_cpp/lasrt.cpp b/LAPACK_CPP/src/lapack_cpp/lasrt.cpp
new file mode 100644
index 000000000..308059887
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_cpp/lasrt.cpp
@@ -0,0 +1,51 @@
+#include <cassert>
+#include <complex>
+#include <type_traits>
+
+#include "lapack_c.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/lasrt.hpp"
+
+namespace lapack_cpp {
+
+template <>
+void lasrt<float, int>(IncDec id, Vector<float, int> d)
+{
+    assert(d.size() >= 0);
+    // fortran code only supports stride 1
+    assert(d.stride() == 1);
+    lapack_idx info;
+    lapack_c_slasrt((char) id, d.size(), d.ptr(), &info);
+}
+
+template <>
+void lasrt<double, int>(IncDec id, Vector<double, int> d)
+{
+    assert(d.size() >= 0);
+    // fortran code only supports stride 1
+    assert(d.stride() == 1);
+    lapack_idx info;
+    lapack_c_dlasrt((char) id, d.size(), d.ptr(), &info);
+}
+
+template <>
+void lasrt<float, int64_t>(IncDec id, Vector<float, int64_t> d)
+{
+    assert(d.size() >= 0);
+    // fortran code only supports stride 1
+    assert(d.stride() == 1);
+    lapack_idx info;
+    lapack_c_slasrt((char) id, d.size(), d.ptr(), &info);
+}
+
+template <>
+void lasrt<double, int64_t>(IncDec id, Vector<double, int64_t> d)
+{
+    assert(d.size() >= 0);
+    // fortran code only supports stride 1
+    assert(d.stride() == 1);
+    lapack_idx info;
+    lapack_c_dlasrt((char) id, d.size(), d.ptr(), &info);
+}
+
+}  // namespace lapack_cpp
\ No newline at end of file

From b44c64faefb637cc24a56033065d9a104e32b499 Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Wed, 6 Mar 2024 15:58:48 +0100
Subject: [PATCH 08/10] add steqr3

---
 LAPACK_CPP/Makefile                           |  9 +-
 LAPACK_CPP/example/.gitignore                 |  3 +-
 LAPACK_CPP/example/profile_steqr3.cpp         | 82 +++++++++++++++++++
 LAPACK_CPP/include/lapack_c.h                 |  1 +
 LAPACK_CPP/include/lapack_c/lae2.h            | 19 +++++
 LAPACK_CPP/include/lapack_cpp/base/matrix.hpp | 10 +++
 LAPACK_CPP/include/lapack_cpp/base/types.hpp  | 32 ++++++++
 LAPACK_CPP/include/lapack_cpp/lapack.hpp      |  4 +
 LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp | 18 ++++
 .../include/lapack_cpp/lapack/lapy2.hpp       | 34 ++++++++
 .../include/lapack_cpp/lapack/lartg.hpp       |  2 +-
 .../include/lapack_cpp/lapack/steqr3.hpp      | 67 +++++++++++++--
 LAPACK_CPP/src/lapack_c/lae2.f90              | 25 ++++++
 LAPACK_CPP/src/lapack_cpp/lae2.cpp            | 23 ++++++
 14 files changed, 313 insertions(+), 16 deletions(-)
 create mode 100644 LAPACK_CPP/example/profile_steqr3.cpp
 create mode 100644 LAPACK_CPP/include/lapack_c/lae2.h
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp
 create mode 100644 LAPACK_CPP/src/lapack_c/lae2.f90
 create mode 100644 LAPACK_CPP/src/lapack_cpp/lae2.cpp

diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile
index 3badab025..80660b628 100644
--- a/LAPACK_CPP/Makefile
+++ b/LAPACK_CPP/Makefile
@@ -1,11 +1,11 @@
 CXX=g++
-# CXXFLAGS=-std=c++17 -Wall -Wextra -pedantic -g -Og
+# CXXFLAGS=-std=c++17 -Wall -Wextra -pedantic -g
 CXXFLAGS=-std=c++17 -O3 -march=native -ffast-math -g -DNDEBUG
 
 FC=gfortran
 FFLAGS=-Wall -Wextra -pedantic -g
 
-all: ./example/test_lasr3 ./example/profile_lasr3
+all: ./example/test_lasr3 ./example/profile_lasr3 ./example/profile_steqr3 $(OBJFILES)
 
 HEADERS = $(wildcard include/lapack_c/*.h) \
 		  $(wildcard include/lapack_c/*/*.h) \
@@ -38,10 +38,7 @@ src/lapack_cpp/%_cpp.o: src/lapack_cpp/%.cpp $(HEADERS)
 	$(CXX) $(CXXFLAGS) -I./include -c -o $@ $<
 
 # Test files
-example/test_lasr3: example/test_lasr3.cpp $(OBJFILES) $(HEADERS)
-	$(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack
-
-example/profile_lasr3: example/profile_lasr3.cpp $(OBJFILES) $(HEADERS)
+example/%: example/%.cpp $(OBJFILES) $(HEADERS)
 	$(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack
 
 clean:
diff --git a/LAPACK_CPP/example/.gitignore b/LAPACK_CPP/example/.gitignore
index ed20f170e..d92df4173 100644
--- a/LAPACK_CPP/example/.gitignore
+++ b/LAPACK_CPP/example/.gitignore
@@ -1,2 +1,3 @@
 test_lasr3
-profile_lasr3
\ No newline at end of file
+profile_lasr3
+profile_steqr3
\ No newline at end of file
diff --git a/LAPACK_CPP/example/profile_steqr3.cpp b/LAPACK_CPP/example/profile_steqr3.cpp
new file mode 100644
index 000000000..d0f20db82
--- /dev/null
+++ b/LAPACK_CPP/example/profile_steqr3.cpp
@@ -0,0 +1,82 @@
+
+#include <chrono> // for high_resolution_clock
+#include <complex>
+#include <iostream>
+
+#include "lapack_cpp.hpp"
+
+using namespace lapack_cpp;
+
+template <typename T,
+          Layout layout = Layout::ColMajor,
+          typename idx_t = lapack_idx_t>
+void profile_steqr3(idx_t n)
+{
+    MemoryBlock<T, idx_t> Z_(n, n, layout);
+    Matrix<T, layout, idx_t> Z(n, n, Z_);
+
+    MemoryBlock<T, idx_t> d_(n);
+    Vector<T, idx_t> d(n, d_);
+
+    MemoryBlock<T, idx_t> e_(n - 1);
+    Vector<T, idx_t> e(n - 1, e_);
+
+    MemoryBlock<T, idx_t> d_copy_(n);
+    Vector<T, idx_t> d_copy(n, d_copy_);
+
+    MemoryBlock<T, idx_t> e_copy_(n - 1);
+    Vector<T, idx_t> e_copy(n - 1, e_copy_);
+
+    randomize(d);
+    randomize(e);
+
+    d_copy = d;
+    e_copy = e;
+
+    bool want_z = true;
+
+    MemoryBlock<T, idx_t> work(steqr3_workquery(want_z, d, e, Z));
+    MemoryBlock<real_t<T>, idx_t> rwork(steqr3_rworkquery(want_z, d, e, Z));
+
+    const idx_t n_timings = 100;
+    const idx_t n_warmup = 50;
+    std::vector<float> timings(n_timings);
+
+    for (idx_t i = 0; i < n_timings; ++i)
+    {
+        d = d_copy;
+        e = e_copy;
+        auto start = std::chrono::high_resolution_clock::now();
+        steqr3(want_z, d, e, Z, rwork, work);
+        auto end = std::chrono::high_resolution_clock::now();
+        timings[i] = std::chrono::duration<float>(end - start).count();
+        std::cout<<i << " " << timings[i] << '\r' << std::flush;
+    }
+
+    float mean = 0;
+    for (idx_t i = n_warmup; i < n_timings; ++i)
+        mean += timings[i];
+    mean /= n_timings - n_warmup;
+
+    float std_dev = 0;
+    for (idx_t i = n_warmup; i < n_timings; ++i)
+        std_dev += (timings[i] - mean) * (timings[i] - mean);
+    std_dev = std::sqrt(std_dev / (n_timings - n_warmup - 1));
+
+    std::cout << "n = " << n << ", mean time = " << mean
+              << " s"
+              << ", std dev = " << std_dev / mean * 100 << " %"
+              << std::endl;
+}
+
+int main()
+{
+    typedef lapack_idx_t idx_t;
+    typedef double T;
+
+    for (int n = 32; n <= 4000; n *= 2)
+    {
+        profile_steqr3<T, Layout::ColMajor, idx_t>(n);
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h
index f6ad2c42e..70e416e45 100644
--- a/LAPACK_CPP/include/lapack_c.h
+++ b/LAPACK_CPP/include/lapack_c.h
@@ -9,6 +9,7 @@
 // LAPACK
 #include "lapack_c/lartg.h"
 #include "lapack_c/lasrt.h"
+#include "lapack_c/lae2.h"
 #include "lapack_c/laev2.h"
 #include "lapack_c/lasrt.h"
 
diff --git a/LAPACK_CPP/include/lapack_c/lae2.h b/LAPACK_CPP/include/lapack_c/lae2.h
new file mode 100644
index 000000000..881db98ff
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/lae2.h
@@ -0,0 +1,19 @@
+#ifndef LAPACK_C_LAE2_HPP
+#define LAPACK_C_LAE2_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+    void lapack_c_slae2(float a, float b, float c, float *rt1, float *rt2);
+
+    void lapack_c_dlae2(double a, double b, double c, double *rt1, double *rt2);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // LAPACK_C_LAE2_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp
index ad10aae8c..d97b18052 100644
--- a/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp
@@ -66,6 +66,11 @@ class Matrix {
         assert(ldim >= (layout == Layout::ColMajor ? m : n));
     }
 
+    // Constructor for a matrix of size m x n
+    Matrix(const idx_t m, const idx_t n, T* ptr)
+        : m_(m), n_(n), ldim_(layout == Layout::ColMajor ? m : n), data_(ptr)
+    {}
+
     // Copy constructor
     Matrix(const Matrix& m)
         : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr())
@@ -222,6 +227,11 @@ class ConstMatrix {
         assert(ldim >= (layout == Layout::ColMajor ? m : n));
     }
 
+    // Constructor for a matrix of size m x n
+    ConstMatrix(const idx_t m, const idx_t n, const T* ptr)
+        : m_(m), n_(n), ldim_(layout == Layout::ColMajor ? m : n), data_(ptr)
+    {}
+
     // Copy constructor
     ConstMatrix(const ConstMatrix& m)
         : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr())
diff --git a/LAPACK_CPP/include/lapack_cpp/base/types.hpp b/LAPACK_CPP/include/lapack_cpp/base/types.hpp
index 2ce546bae..3b4475c27 100644
--- a/LAPACK_CPP/include/lapack_cpp/base/types.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/base/types.hpp
@@ -30,4 +30,36 @@ struct real_type_struct<std::complex<T>> {
 template <typename T>
 using real_t = typename real_type_struct<T>::type;
 
+
+template <typename T>
+constexpr real_t<T> ulp() noexcept
+{
+    return std::numeric_limits<real_t<T>>::epsilon();
+}
+
+template <typename T>
+constexpr real_t<T> safe_min() noexcept
+{
+    const int fradix = std::numeric_limits<real_t<T>>::radix;
+    const int expm = std::numeric_limits<real_t<T>>::min_exponent;
+    const int expM = std::numeric_limits<real_t<T>>::max_exponent;
+
+    return pow(fradix, real_t<T>(std::max(expm - 1, 1 - expM)));
+}
+
+/// Sign function for real numbers
+/// Note, this has the following behavior:
+/// sgn(x) = 1 if x > 0
+/// sgn(x) = -1 if x < 0
+/// sgn(0) = 1
+/// sgn(-0) = 1
+/// sgn(+Inf) = 1
+/// sgn(-Inf) = 1
+/// sgn(NaN) = -1
+template <typename T>
+constexpr T sgn(const T& val)
+{
+    return (val >= T(0)) ? T(1) : T(-1);
+}
+
 #endif  // LAPACK_CPP_BASE_TYPES_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack.hpp b/LAPACK_CPP/include/lapack_cpp/lapack.hpp
index 31e03ce78..bb515fbf2 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack.hpp
@@ -1,7 +1,11 @@
 #ifndef LAPACK_CPP_LAPACK_HPP
 #define LAPACK_CPP_LAPACK_HPP
 
+#include "lapack_cpp/lapack/lae2.hpp"
+#include "lapack_cpp/lapack/laev2.hpp"
 #include "lapack_cpp/lapack/lartg.hpp"
 #include "lapack_cpp/lapack/lasr3.hpp"
+#include "lapack_cpp/lapack/lasrt.hpp"
+#include "lapack_cpp/lapack/steqr3.hpp"
 
 #endif // LAPACK_CPP_LAPACK_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp
new file mode 100644
index 000000000..0e907d9a1
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp
@@ -0,0 +1,18 @@
+#ifndef LAPACK_CPP_LAE2_HPP
+#define LAPACK_CPP_LAE2_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/utils.hpp"
+
+namespace lapack_cpp {
+
+/** Computes the eigenvalues of a real symmetric 2x2 matrix A
+ *  [ a b ]
+ *  [ b c ]
+ */
+template <typename T>
+void lae2(T a, T b, T c, T& s1, T& s2);
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_LAE2_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp
new file mode 100644
index 000000000..f43de452d
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp
@@ -0,0 +1,34 @@
+#ifndef LAPACK_CPP_LAPY2_HPP
+#define LAPACK_CPP_LAPY2_HPP
+
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/utils.hpp"
+namespace lapack_cpp {
+
+/**
+ * Return \f$ \sqrt{f^2 + g^2} \f$ taking care to avoid overflow and underflow.
+ */
+template <typename T>
+T lapy2(T x, T y){
+    // constants
+    const T one(1);
+    const T zero(0);
+    const T xabs = abs(x);
+    const T yabs = abs(y);
+
+    T w, z;
+    if (xabs > yabs) {
+        w = xabs;
+        z = yabs;
+    }
+    else {
+        w = yabs;
+        z = xabs;
+    }
+
+    return (z == zero) ? w : w * sqrt(one + (z / w) * (z / w));
+}
+
+}  // namespace lapack_cpp
+
+#endif  // LAPACK_CPP_LAPY2_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp
index 5f5b345dd..d479abbc1 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp
@@ -13,4 +13,4 @@ void lartg(T f, T g, real_t<T>& c, T& s, T& r);
 
 }  // namespace lapack_cpp
 
-#endif  // LAPACK_CPP_GEMV_HPP
\ No newline at end of file
+#endif  // LAPACK_CPP_LARTG_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
index d6420dd6a..38333c512 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
@@ -6,9 +6,54 @@
 #include "lapack_cpp/lapack/lartg.hpp"
 #include "lapack_cpp/lapack/lasrt.hpp"
 #include "lapack_cpp/lapack/laev2.hpp"
+#include "lapack_cpp/lapack/lae2.hpp"
+#include "lapack_cpp/lapack/lapy2.hpp"
+
 
 namespace lapack_cpp
 {
+    template <typename T,
+              Layout layout,
+              typename idx_t>
+    idx_t steqr3_workquery(bool want_z,
+                           Vector<real_t<T>, idx_t> d,
+                           Vector<real_t<T>, idx_t> e,
+                           Matrix<T, layout, idx_t> Z)
+    {
+        const auto nb = idx_t{32};
+        const idx_t n = d.size();
+
+        assert(n == e.size() + 1);
+        assert(n == Z.num_columns());
+        assert(n == Z.num_rows());
+
+        // Create empty wrappers for C and S to do workspace query
+        ConstMatrix<real_t<T>, layout, idx_t> C(n - 1, nb, nullptr);
+        ConstMatrix<real_t<T>, layout, idx_t> S(n - 1, nb, nullptr);
+
+        return lasr3_workquery(Side::Right, Direction::Forward, C, S, Z);
+    }
+
+    template <typename T,
+              Layout layout,
+              typename idx_t>
+    idx_t steqr3_rworkquery(bool want_z,
+                            Vector<real_t<T>, idx_t> d,
+                            Vector<real_t<T>, idx_t> e,
+                            Matrix<T, layout, idx_t> Z)
+    {
+        const auto nb = idx_t{32};
+        const idx_t n = d.size();
+
+        assert(n == e.size() + 1);
+        assert(n == Z.num_columns());
+        assert(n == Z.num_rows());
+
+        if (layout == Layout::ColMajor)
+            return 2 * calc_ld<real_t<T>, idx_t>(n - 1) * nb;
+        else
+            return 2 * (n - 1) * calc_ld<real_t<T>, idx_t>(nb);
+    }
 
     /**
      * STEQR3 computes all eigenvalues and, optionally, eigenvectors of a
@@ -58,7 +103,13 @@ namespace lapack_cpp
         const T_real two(2);
         const T_real one(1);
         const T_real zero(0);
-        const idx_t n = size(d);
+        const idx_t n = d.size();
+
+        assert(n == e.size() + 1);
+        assert(n == Z.num_columns());
+        assert(n == Z.num_rows());
+        assert( rwork.size() >= steqr3_rworkquery(want_z, d, e, Z) );
+        assert( work.size() >= steqr3_workquery(want_z, d, e, Z) );
 
         // Amount of rotation sequences to generate before applying it.
         const auto nb = idx_t{32};
@@ -118,7 +169,7 @@ namespace lapack_cpp
             // If we have saved up enough rotations, apply them
             if (want_z and (i_block >= nb or iter == itmax or istop <= 1))
             {
-                idx_t i_block2 = min<idx_t>(i_block + 1, nb);
+                idx_t i_block2 = std::min<idx_t>(i_block + 1, nb);
 
                 // Find smallest index where rotation is not identity
                 idx_t i_start_block = n - 1;
@@ -150,9 +201,9 @@ namespace lapack_cpp
                 if (i_start_block < i_stop_block + 1)
                 {
                     auto C2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
-                    auto S2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
+                    auto S2 = S.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
                     auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2);
-                    lasr3(Side::Right, direction, C2, S2, Z2, work);
+                    lasr3(Side::Right, direction, C2.as_const(), S2.as_const(), Z2, work);
                 }
                 // Reset block
                 i_block = 0;
@@ -249,7 +300,7 @@ namespace lapack_cpp
                 {
                     // We don't want different directions in our saved rotation matrices
                     // So we apply all the current rotations and reset the block
-                    idx_t i_block2 = min<idx_t>(i_block + 1, nb);
+                    idx_t i_block2 = std::min<idx_t>(i_block + 1, nb);
 
                     // Find smallest index where rotation is not identity
                     idx_t i_start_block = n - 1;
@@ -284,7 +335,7 @@ namespace lapack_cpp
                         auto S2 = S.submatrix(i_start_block, i_stop_block + 1, 0, i_block2);
                         auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2);
 
-                        lasr3(Side::Right, direction, C2, S2, Z2, work);
+                        lasr3(Side::Right, direction, C2.as_const(), S2.as_const(), Z2, work);
                     }
                     // Reset block
                     i_block = 0;
@@ -408,8 +459,8 @@ namespace lapack_cpp
                 {
                     d[k] = d[i];
                     d[i] = p;
-                    auto z1 = col(Z, i);
-                    auto z2 = col(Z, k);
+                    auto z1 = Z.column(i);
+                    auto z2 = Z.column(k);
                     for (idx_t j = 0; j < n; ++j)
                     {
                         auto temp = z1[j];
diff --git a/LAPACK_CPP/src/lapack_c/lae2.f90 b/LAPACK_CPP/src/lapack_c/lae2.f90
new file mode 100644
index 000000000..083a77d4e
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/lae2.f90
@@ -0,0 +1,25 @@
+subroutine lapack_c_slae2(a, b, c, rt1, rt2) bind(c, name='lapack_c_slae2')
+    use iso_c_binding, only: c_float
+    implicit none
+    real(kind=c_float), intent(in), value :: a, b, c
+    real(kind=c_float), intent(out) :: rt1, rt2
+
+    external slae2
+
+    ! Forward call to fortran implementation
+    call slae2( a, b, c, rt1, rt2)
+
+end subroutine lapack_c_slae2
+
+subroutine lapack_c_dlae2(a, b, c, rt1, rt2) bind(c, name='lapack_c_dlae2')
+    use iso_c_binding, only: c_double
+    implicit none
+    real(kind=c_double), intent(in), value :: a, b, c
+    real(kind=c_double), intent(out) :: rt1, rt2
+
+    external dlae2
+
+    ! Forward call to fortran implementation
+    call dlae2( a, b, c, rt1, rt2)
+
+end subroutine lapack_c_dlae2
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_cpp/lae2.cpp b/LAPACK_CPP/src/lapack_cpp/lae2.cpp
new file mode 100644
index 000000000..d1396edda
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_cpp/lae2.cpp
@@ -0,0 +1,23 @@
+#include <cassert>
+#include <complex>
+#include <type_traits>
+
+#include "lapack_c.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/lae2.hpp"
+
+namespace lapack_cpp {
+
+template <>
+void lae2<float>(float a, float b, float c, float& s1, float& s2)
+{
+    lapack_c_slae2(a, b, c, &s1, &s2);
+}
+
+template <>
+void lae2<double>(double a, double b, double c, double& s1, double& s2)
+{
+    lapack_c_dlae2(a, b, c, &s1, &s2);
+}
+
+}  // namespace lapack_cpp
\ No newline at end of file

From 7a2ecc24e52f8dbe567dd488b61d2790b570ae7c Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Wed, 6 Mar 2024 16:33:46 +0100
Subject: [PATCH 09/10] add fortran wrappers for steqr3

---
 LAPACK_CPP/example/profile_steqr3.cpp         |   8 +-
 LAPACK_CPP/include/lapack_c.h                 |   1 +
 LAPACK_CPP/include/lapack_c/steqr3.h          |  63 +++++++
 LAPACK_CPP/include/lapack_cpp/base/enums.hpp  |  17 ++
 .../include/lapack_cpp/lapack/steqr3.hpp      |  34 +++-
 LAPACK_CPP/src/lapack_c/steqr3.cpp            | 169 ++++++++++++++++++
 LAPACK_CPP/src/lapack_fortran/csteqr3.f90     |  31 ++++
 LAPACK_CPP/src/lapack_fortran/dsteqr3.f90     |  29 +++
 LAPACK_CPP/src/lapack_fortran/ssteqr3.f90     |  29 +++
 LAPACK_CPP/src/lapack_fortran/zsteqr3.f90     |  31 ++++
 10 files changed, 402 insertions(+), 10 deletions(-)
 create mode 100644 LAPACK_CPP/include/lapack_c/steqr3.h
 create mode 100644 LAPACK_CPP/src/lapack_c/steqr3.cpp
 create mode 100644 LAPACK_CPP/src/lapack_fortran/csteqr3.f90
 create mode 100644 LAPACK_CPP/src/lapack_fortran/dsteqr3.f90
 create mode 100644 LAPACK_CPP/src/lapack_fortran/ssteqr3.f90
 create mode 100644 LAPACK_CPP/src/lapack_fortran/zsteqr3.f90

diff --git a/LAPACK_CPP/example/profile_steqr3.cpp b/LAPACK_CPP/example/profile_steqr3.cpp
index d0f20db82..517dd4b86 100644
--- a/LAPACK_CPP/example/profile_steqr3.cpp
+++ b/LAPACK_CPP/example/profile_steqr3.cpp
@@ -33,10 +33,10 @@ void profile_steqr3(idx_t n)
     d_copy = d;
     e_copy = e;
 
-    bool want_z = true;
+    CompQ compz = CompQ::Initialize;
 
-    MemoryBlock<T, idx_t> work(steqr3_workquery(want_z, d, e, Z));
-    MemoryBlock<real_t<T>, idx_t> rwork(steqr3_rworkquery(want_z, d, e, Z));
+    MemoryBlock<T, idx_t> work(steqr3_workquery(compz, d, e, Z));
+    MemoryBlock<real_t<T>, idx_t> rwork(steqr3_rworkquery(compz, d, e, Z));
 
     const idx_t n_timings = 100;
     const idx_t n_warmup = 50;
@@ -47,7 +47,7 @@ void profile_steqr3(idx_t n)
         d = d_copy;
         e = e_copy;
         auto start = std::chrono::high_resolution_clock::now();
-        steqr3(want_z, d, e, Z, rwork, work);
+        steqr3(compz, d, e, Z, work, rwork);
         auto end = std::chrono::high_resolution_clock::now();
         timings[i] = std::chrono::duration<float>(end - start).count();
         std::cout<<i << " " << timings[i] << '\r' << std::flush;
diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h
index 70e416e45..54d5fe637 100644
--- a/LAPACK_CPP/include/lapack_c.h
+++ b/LAPACK_CPP/include/lapack_c.h
@@ -12,5 +12,6 @@
 #include "lapack_c/lae2.h"
 #include "lapack_c/laev2.h"
 #include "lapack_c/lasrt.h"
+#include "lapack_c/steqr3.h"
 
 #endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/steqr3.h b/LAPACK_CPP/include/lapack_c/steqr3.h
new file mode 100644
index 000000000..57741b9d9
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/steqr3.h
@@ -0,0 +1,63 @@
+#ifndef LAPACK_C_ROT_HPP
+#define LAPACK_C_ROT_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+    void lapack_c_ssteqr3(char compz,
+                         lapack_idx n,
+                         float *d,
+                         float *e,
+                         float *Z,
+                         lapack_idx ldz,
+                         float *work,
+                         lapack_idx lwork,
+                         float *rwork,
+                         lapack_idx lrwork,
+                         lapack_idx *info);
+
+    void lapack_c_dsteqr3(char compz,
+                         lapack_idx n,
+                         double *d,
+                         double *e,
+                         double *Z,
+                         lapack_idx ldz,
+                         double *work,
+                         lapack_idx lwork,
+                         double *rwork,
+                         lapack_idx lrwork,
+                         lapack_idx *info);
+
+    void lapack_c_csteqr3(char compz,
+                         lapack_idx n,
+                         float *d,
+                         float *e,
+                         lapack_float_complex *Z,
+                         lapack_idx ldz,
+                         lapack_float_complex *work,
+                         lapack_idx lwork,
+                         float *rwork,
+                         lapack_idx lrwork,
+                         lapack_idx *info);
+
+    void lapack_c_zsteqr3(char compz,
+                         lapack_idx n,
+                         double *d,
+                         double *e,
+                         lapack_double_complex *Z,
+                         lapack_idx ldz,
+                         lapack_double_complex *work,
+                         lapack_idx lwork,
+                         double *rwork,
+                         lapack_idx lrwork,
+                         lapack_idx *info);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
index 6bd6c5365..597c899fe 100644
--- a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp
@@ -99,6 +99,23 @@ inline constexpr IncDec char2incdec(char t)
     }
 }
 
+enum class CompQ : char { No = 'N', Update = 'V', Initialize = 'I' };
+
+inline constexpr CompQ char2compq(char t)
+{
+    switch (t) {
+        case 'N':
+            return CompQ::No;
+        case 'V':
+            return CompQ::Update;
+        case 'I':
+            return CompQ::Initialize;
+        default:
+            assert(false);
+            return CompQ::No;
+    }
+}
+
 }  // namespace lapack_cpp
 
 #endif  // LAPACK_CPP_ENUMP_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
index 38333c512..11016f6ed 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
@@ -15,11 +15,13 @@ namespace lapack_cpp
     template <typename T,
               Layout layout,
               typename idx_t>
-    idx_t steqr3_workquery(bool want_z,
+    idx_t steqr3_workquery(CompQ compz,
                            Vector<real_t<T>, idx_t> d,
                            Vector<real_t<T>, idx_t> e,
                            Matrix<T, layout, idx_t> Z)
     {
+        if( compz == CompQ::No )
+            return 0;
         const auto nb = idx_t{32};
         const idx_t n = d.size();
 
@@ -37,7 +39,7 @@ namespace lapack_cpp
     template <typename T,
               Layout layout,
               typename idx_t>
-    idx_t steqr3_rworkquery(bool want_z,
+    idx_t steqr3_rworkquery(CompQ compz,
                             Vector<real_t<T>, idx_t> d,
                             Vector<real_t<T>, idx_t> e,
                             Matrix<T, layout, idx_t> Z)
@@ -82,6 +84,12 @@ namespace lapack_cpp
      *      to tridiagonal form.
      *      On exit, if info = 0, and want_z=true then Z contains the
      *      orthonormal eigenvectors of the original Hermitian matrix.
+     * 
+     * @param[out] work Workspace array whose size is returned by
+     *    steqr3_workquery.
+     * 
+     * @param[out] rwork Real workspace array whose size is returned by
+     *     steqr3_rworkquery.
      *
      * @ingroup computational
      */
@@ -90,12 +98,12 @@ namespace lapack_cpp
               typename idx_t,
               bool aligned>
     int steqr3(
-        bool want_z,
+        CompQ compz,
         Vector<real_t<T>, idx_t> d,
         Vector<real_t<T>, idx_t> e,
         Matrix<T, layout, idx_t> Z,
-        MemoryBlock<real_t<T>, idx_t, aligned> &rwork,
-        MemoryBlock<T, idx_t, aligned> &work)
+        MemoryBlock<T, idx_t, aligned> &work,
+        MemoryBlock<real_t<T>, idx_t, aligned> &rwork)
     {
         using T_real = real_t<T>;
 
@@ -119,11 +127,25 @@ namespace lapack_cpp
             return 0;
         if (n == 1)
         {
-            if (want_z)
+            if (compz == CompQ::Initialize)
                 Z(0, 0) = one;
             return 0;
         }
 
+        bool want_z = compz != CompQ::No;
+
+        if( compz == CompQ::Initialize )
+        {
+            for (idx_t j = 0; j < n; ++j)
+            {
+                for (idx_t i = 0; i < n; ++i)
+                {
+                    Z(i, j) = zero;
+                }
+                Z(j, j) = one;
+            }
+        }
+
         // Determine the unit roundoff and over/underflow thresholds.
         const T_real eps = ulp<T_real>();
         const T_real eps2 = eps * eps;
diff --git a/LAPACK_CPP/src/lapack_c/steqr3.cpp b/LAPACK_CPP/src/lapack_c/steqr3.cpp
new file mode 100644
index 000000000..61f1fd64f
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/steqr3.cpp
@@ -0,0 +1,169 @@
+#include "lapack_c/steqr3.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/steqr3.hpp"
+
+using namespace lapack_cpp;
+
+// Actual definitions
+extern "C"
+{
+
+    void lapack_c_ssteqr3(char compz,
+                          lapack_idx n,
+                          float *d,
+                          float *e,
+                          float *Z,
+                          lapack_idx ldz,
+                          float *work,
+                          lapack_idx lwork,
+                          float *rwork,
+                          lapack_idx lrwork,
+                          lapack_idx *info)
+    {
+        CompQ compz_ = char2compq(compz);
+
+        Vector<float, lapack_idx> d_(n, d);
+        Vector<float, lapack_idx> e_(n-1, e);
+        Matrix<float, Layout::ColMajor, lapack_idx> Z_(n, n, Z, ldz);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_);
+            *work = lwork_;
+        }
+
+        if (lrwork == -1)
+        {
+            lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_);
+            *rwork = lrwork_;
+        }
+
+        if( lwork == -1 || lrwork == -1)
+            return;
+
+        MemoryBlock<float, lapack_idx> work_(lwork, work);
+        MemoryBlock<float, lapack_idx> rwork_(lrwork, rwork);
+
+        *info = steqr3(compz_, d_, e_, Z_, work_, rwork_);
+    }
+
+
+    void lapack_c_dsteqr3(char compz,
+                          lapack_idx n,
+                          double *d,
+                          double *e,
+                          double *Z,
+                          lapack_idx ldz,
+                          double *work,
+                          lapack_idx lwork,
+                          double *rwork,
+                          lapack_idx lrwork,
+                          lapack_idx *info)
+    {
+        CompQ compz_ = char2compq(compz);
+
+        Vector<double, lapack_idx> d_(n, d);
+        Vector<double, lapack_idx> e_(n-1, e);
+        Matrix<double, Layout::ColMajor, lapack_idx> Z_(n, n, Z, ldz);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_);
+            *work = lwork_;
+        }
+
+        if (lrwork == -1)
+        {
+            lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_);
+            *rwork = lrwork_;
+        }
+
+        if( lwork == -1 || lrwork == -1)
+            return;
+
+        MemoryBlock<double, lapack_idx> work_(lwork, work);
+        MemoryBlock<double, lapack_idx> rwork_(lrwork, rwork);
+
+        *info = steqr3(compz_, d_, e_, Z_, work_, rwork_);
+    }
+
+    void lapack_c_csteqr3(char compz,
+                          lapack_idx n,
+                          float *d,
+                          float *e,
+                          lapack_float_complex *Z,
+                          lapack_idx ldz,
+                          lapack_float_complex *work,
+                          lapack_idx lwork,
+                          float *rwork,
+                          lapack_idx lrwork,
+                          lapack_idx *info)
+    {
+        CompQ compz_ = char2compq(compz);
+
+        Vector<float, lapack_idx> d_(n, d);
+        Vector<float, lapack_idx> e_(n-1, e);
+        Matrix<lapack_float_complex, Layout::ColMajor, lapack_idx> Z_(n, n, Z, ldz);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_);
+            *work = lwork_;
+        }
+
+        if (lrwork == -1)
+        {
+            lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_);
+            *rwork = lrwork_;
+        }
+
+        if( lwork == -1 || lrwork == -1)
+            return;
+
+        MemoryBlock<lapack_float_complex, lapack_idx> work_(lwork, work);
+        MemoryBlock<float, lapack_idx> rwork_(lrwork, rwork);
+
+        *info = steqr3(compz_, d_, e_, Z_, work_, rwork_);
+    }
+
+
+    void lapack_c_zsteqr3(char compz,
+                          lapack_idx n,
+                          double *d,
+                          double *e,
+                          lapack_double_complex *Z,
+                          lapack_idx ldz,
+                          lapack_double_complex *work,
+                          lapack_idx lwork,
+                          double *rwork,
+                          lapack_idx lrwork,
+                          lapack_idx *info)
+    {
+        CompQ compz_ = char2compq(compz);
+
+        Vector<double, lapack_idx> d_(n, d);
+        Vector<double, lapack_idx> e_(n-1, e);
+        Matrix<lapack_double_complex, Layout::ColMajor, lapack_idx> Z_(n, n, Z, ldz);
+
+        if (lwork == -1)
+        {
+            lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_);
+            *work = lwork_;
+        }
+
+        if (lrwork == -1)
+        {
+            lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_);
+            *rwork = lrwork_;
+        }
+
+        if( lwork == -1 || lrwork == -1)
+            return;
+
+        MemoryBlock<lapack_double_complex, lapack_idx> work_(lwork, work);
+        MemoryBlock<double, lapack_idx> rwork_(lrwork, rwork);
+
+        *info = steqr3(compz_, d_, e_, Z_, work_, rwork_);
+    }
+
+} // extern "C"
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_fortran/csteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/csteqr3.f90
new file mode 100644
index 000000000..eb1115aa8
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/csteqr3.f90
@@ -0,0 +1,31 @@
+subroutine csteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_csteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,&
+             lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_csteqr3")
+            use iso_c_binding, only: c_char, c_float, c_float_complex, c_int
+            implicit none
+            integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c
+            integer(c_int), intent(out) :: info_c
+            character(c_char), intent(in) :: compz_c
+            real(c_float), intent(inout) :: d_c(*), e_c(*), rwork_c(*)
+            complex(c_float_complex), intent(inout) :: z_c(ldz_c,*), work_c(*)
+        end subroutine lapack_c_csteqr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: n, ldz, lwork, lrwork
+    integer, intent(out) :: info
+    character, intent(in) :: compz
+    real, intent(inout) :: d(*), e(*), rwork(*)
+    complex, intent(inout) :: z(ldz,*), work(*)
+    character(c_char) :: compz_to_c
+
+    ! Convert characters to C characters
+    compz_to_c = compz
+
+    ! Call C function
+    call lapack_c_csteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info)
+end subroutine csteqr3
diff --git a/LAPACK_CPP/src/lapack_fortran/dsteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/dsteqr3.f90
new file mode 100644
index 000000000..d4aae1fc4
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/dsteqr3.f90
@@ -0,0 +1,29 @@
+subroutine dsteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_dsteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,&
+             lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_dsteqr3")
+            use iso_c_binding, only: c_char, c_double, c_double_complex, c_int
+            implicit none
+            integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c
+            integer(c_int), intent(out) :: info_c
+            character(c_char), intent(in) :: compz_c
+            real(c_double), intent(inout) :: d_c(*), e_c(*), rwork_c(*), z_c(ldz_c,*), work_c(*)
+        end subroutine lapack_c_dsteqr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: n, ldz, lwork, lrwork
+    integer, intent(out) :: info
+    character, intent(in) :: compz
+    double precision, intent(inout) :: d(*), e(*), rwork(*), z(ldz,*), work(*)
+    character(c_char) :: compz_to_c
+
+    ! Convert characters to C characters
+    compz_to_c = compz
+
+    ! Call C function
+    call lapack_c_dsteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info)
+end subroutine dsteqr3
diff --git a/LAPACK_CPP/src/lapack_fortran/ssteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/ssteqr3.f90
new file mode 100644
index 000000000..e40e278d9
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/ssteqr3.f90
@@ -0,0 +1,29 @@
+subroutine ssteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_ssteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,&
+             lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_ssteqr3")
+            use iso_c_binding, only: c_char, c_float, c_float_complex, c_int
+            implicit none
+            integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c
+            integer(c_int), intent(out) :: info_c
+            character(c_char), intent(in) :: compz_c
+            real(c_float), intent(inout) :: d_c(*), e_c(*), rwork_c(*), z_c(ldz_c,*), work_c(*)
+        end subroutine lapack_c_ssteqr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: n, ldz, lwork, lrwork
+    integer, intent(out) :: info
+    character, intent(in) :: compz
+    real, intent(inout) :: d(*), e(*), rwork(*), z(ldz,*), work(*)
+    character(c_char) :: compz_to_c
+
+    ! Convert characters to C characters
+    compz_to_c = compz
+
+    ! Call C function
+    call lapack_c_ssteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info)
+end subroutine ssteqr3
diff --git a/LAPACK_CPP/src/lapack_fortran/zsteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/zsteqr3.f90
new file mode 100644
index 000000000..f12321579
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_fortran/zsteqr3.f90
@@ -0,0 +1,31 @@
+subroutine zsteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info )
+    use iso_c_binding, only: c_char
+    implicit none
+    ! Interface to C
+    interface
+        subroutine lapack_c_zsteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,&
+             lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_zsteqr3")
+            use iso_c_binding, only: c_char, c_double, c_double_complex, c_int
+            implicit none
+            integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c
+            integer(c_int), intent(out) :: info_c
+            character(c_char), intent(in) :: compz_c
+            real(c_double), intent(inout) :: d_c(*), e_c(*), rwork_c(*)
+            complex(c_double_complex), intent(inout) :: z_c(ldz_c,*), work_c(*)
+        end subroutine lapack_c_zsteqr3
+    end interface
+
+    ! Arguments
+    integer, intent(in) :: n, ldz, lwork, lrwork
+    integer, intent(out) :: info
+    character, intent(in) :: compz
+    double precision, intent(inout) :: d(*), e(*), rwork(*)
+    double complex, intent(inout) :: z(ldz,*), work(*)
+    character(c_char) :: compz_to_c
+
+    ! Convert characters to C characters
+    compz_to_c = compz
+
+    ! Call C function
+    call lapack_c_zsteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info)
+end subroutine zsteqr3

From 7094cb0f7c8e379ab56517e2bf529a4f6e14e6a6 Mon Sep 17 00:00:00 2001
From: thijssteel <thijs.steel@kuleuven.be>
Date: Wed, 6 Mar 2024 17:07:20 +0100
Subject: [PATCH 10/10] add steqr wrappers for comparison

---
 LAPACK_CPP/example/profile_steqr3.cpp         |  50 ++++--
 LAPACK_CPP/include/lapack_c.h                 |   1 +
 LAPACK_CPP/include/lapack_c/steqr.h           |  51 ++++++
 LAPACK_CPP/include/lapack_c/steqr3.h          |   6 +-
 LAPACK_CPP/include/lapack_cpp/lapack.hpp      |   1 +
 .../include/lapack_cpp/lapack/lasr3.hpp       |   7 +
 .../include/lapack_cpp/lapack/steqr.hpp       |  25 +++
 .../include/lapack_cpp/lapack/steqr3.hpp      |   2 +-
 LAPACK_CPP/src/lapack_c/steqr.f90             |  65 ++++++++
 LAPACK_CPP/src/lapack_cpp/steqr.cpp           | 151 ++++++++++++++++++
 10 files changed, 340 insertions(+), 19 deletions(-)
 create mode 100644 LAPACK_CPP/include/lapack_c/steqr.h
 create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp
 create mode 100644 LAPACK_CPP/src/lapack_c/steqr.f90
 create mode 100644 LAPACK_CPP/src/lapack_cpp/steqr.cpp

diff --git a/LAPACK_CPP/example/profile_steqr3.cpp b/LAPACK_CPP/example/profile_steqr3.cpp
index 517dd4b86..f6807c498 100644
--- a/LAPACK_CPP/example/profile_steqr3.cpp
+++ b/LAPACK_CPP/example/profile_steqr3.cpp
@@ -10,7 +10,7 @@ using namespace lapack_cpp;
 template <typename T,
           Layout layout = Layout::ColMajor,
           typename idx_t = lapack_idx_t>
-void profile_steqr3(idx_t n)
+void profile_steqr3(idx_t n, bool use_fortran = false)
 {
     MemoryBlock<T, idx_t> Z_(n, n, layout);
     Matrix<T, layout, idx_t> Z(n, n, Z_);
@@ -35,22 +35,38 @@ void profile_steqr3(idx_t n)
 
     CompQ compz = CompQ::Initialize;
 
-    MemoryBlock<T, idx_t> work(steqr3_workquery(compz, d, e, Z));
-    MemoryBlock<real_t<T>, idx_t> rwork(steqr3_rworkquery(compz, d, e, Z));
-
     const idx_t n_timings = 100;
     const idx_t n_warmup = 50;
     std::vector<float> timings(n_timings);
 
-    for (idx_t i = 0; i < n_timings; ++i)
+    if (use_fortran)
+    {
+        MemoryBlock<real_t<T>, idx_t> rwork(2 * n - 2);
+        for (idx_t i = 0; i < n_timings; ++i)
+        {
+            d = d_copy;
+            e = e_copy;
+            auto start = std::chrono::high_resolution_clock::now();
+            steqr(compz, d, e, Z, rwork);
+            auto end = std::chrono::high_resolution_clock::now();
+            timings[i] = std::chrono::duration<float>(end - start).count();
+            std::cout << i << " " << timings[i] << '\r' << std::flush;
+        }
+    }
+    else
     {
-        d = d_copy;
-        e = e_copy;
-        auto start = std::chrono::high_resolution_clock::now();
-        steqr3(compz, d, e, Z, work, rwork);
-        auto end = std::chrono::high_resolution_clock::now();
-        timings[i] = std::chrono::duration<float>(end - start).count();
-        std::cout<<i << " " << timings[i] << '\r' << std::flush;
+        MemoryBlock<T, idx_t> work(steqr3_workquery(compz, d, e, Z));
+        MemoryBlock<real_t<T>, idx_t> rwork(steqr3_rworkquery(compz, d, e, Z));
+        for (idx_t i = 0; i < n_timings; ++i)
+        {
+            d = d_copy;
+            e = e_copy;
+            auto start = std::chrono::high_resolution_clock::now();
+            steqr3(compz, d, e, Z, work, rwork);
+            auto end = std::chrono::high_resolution_clock::now();
+            timings[i] = std::chrono::duration<float>(end - start).count();
+            std::cout << i << " " << timings[i] << '\r' << std::flush;
+        }
     }
 
     float mean = 0;
@@ -63,7 +79,7 @@ void profile_steqr3(idx_t n)
         std_dev += (timings[i] - mean) * (timings[i] - mean);
     std_dev = std::sqrt(std_dev / (n_timings - n_warmup - 1));
 
-    std::cout << "n = " << n << ", mean time = " << mean
+    std::cout << "n = " << n << ", using fortran: " << use_fortran << ", mean time = " << mean
               << " s"
               << ", std dev = " << std_dev / mean * 100 << " %"
               << std::endl;
@@ -74,9 +90,13 @@ int main()
     typedef lapack_idx_t idx_t;
     typedef double T;
 
-    for (int n = 32; n <= 4000; n *= 2)
+    for (int n = 32; n <= 2000; n *= 2)
+    {
+        profile_steqr3<T, Layout::ColMajor, idx_t>(n, false);
+    }
+    for (int n = 32; n <= 2000; n *= 2)
     {
-        profile_steqr3<T, Layout::ColMajor, idx_t>(n);
+        profile_steqr3<T, Layout::ColMajor, idx_t>(n, true);
     }
     return 0;
 }
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h
index 54d5fe637..f3fc0b7d4 100644
--- a/LAPACK_CPP/include/lapack_c.h
+++ b/LAPACK_CPP/include/lapack_c.h
@@ -12,6 +12,7 @@
 #include "lapack_c/lae2.h"
 #include "lapack_c/laev2.h"
 #include "lapack_c/lasrt.h"
+#include "lapack_c/steqr.h"
 #include "lapack_c/steqr3.h"
 
 #endif
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/steqr.h b/LAPACK_CPP/include/lapack_c/steqr.h
new file mode 100644
index 000000000..5d8db0065
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_c/steqr.h
@@ -0,0 +1,51 @@
+#ifndef LAPACK_C_STEQR_HPP
+#define LAPACK_C_STEQR_HPP
+
+#include "lapack_c/util.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+    void lapack_c_ssteqr(char compz,
+                         lapack_idx n,
+                         float *d,
+                         float *e,
+                         float *Z,
+                         lapack_idx ldz,
+                         float *work,
+                         lapack_idx *info);
+
+    void lapack_c_dsteqr(char compz,
+                         lapack_idx n,
+                         double *d,
+                         double *e,
+                         double *Z,
+                         lapack_idx ldz,
+                         double *work,
+                         lapack_idx *info);
+
+    void lapack_c_csteqr(char compz,
+                         lapack_idx n,
+                         float *d,
+                         float *e,
+                         lapack_float_complex *Z,
+                         lapack_idx ldz,
+                         float *work,
+                         lapack_idx *info);
+
+    void lapack_c_zsteqr(char compz,
+                         lapack_idx n,
+                         double *d,
+                         double *e,
+                         lapack_double_complex *Z,
+                         lapack_idx ldz,
+                         double *work,
+                         lapack_idx *info);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // LAPACK_C_STEQR_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_c/steqr3.h b/LAPACK_CPP/include/lapack_c/steqr3.h
index 57741b9d9..b1ae85fd1 100644
--- a/LAPACK_CPP/include/lapack_c/steqr3.h
+++ b/LAPACK_CPP/include/lapack_c/steqr3.h
@@ -1,5 +1,5 @@
-#ifndef LAPACK_C_ROT_HPP
-#define LAPACK_C_ROT_HPP
+#ifndef LAPACK_C_STEQR3_HPP
+#define LAPACK_C_STEQR3_HPP
 
 #include "lapack_c/util.h"
 
@@ -60,4 +60,4 @@ extern "C"
 }
 #endif // __cplusplus
 
-#endif
\ No newline at end of file
+#endif // LAPACK_C_STEQR3_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack.hpp b/LAPACK_CPP/include/lapack_cpp/lapack.hpp
index bb515fbf2..29b84f589 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack.hpp
@@ -6,6 +6,7 @@
 #include "lapack_cpp/lapack/lartg.hpp"
 #include "lapack_cpp/lapack/lasr3.hpp"
 #include "lapack_cpp/lapack/lasrt.hpp"
+#include "lapack_cpp/lapack/steqr.hpp"
 #include "lapack_cpp/lapack/steqr3.hpp"
 
 #endif // LAPACK_CPP_LAPACK_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
index a1ecfa66f..b91101b14 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp
@@ -127,6 +127,8 @@ namespace internal {
                      TS s2)
     {
         assert(x1.size() == x2.size() && x2.size() == x3.size());
+        if( c1 == real_t<T>{1.} && s1 == TS{0.} && c2 == real_t<T>{1.} && s2 == TS{0} )
+            return;
         const idx_t n = x1.size();
         for (idx_t i = 0; i < n; ++i) {
             T x2_g1 = -conj(s1) * x1[i] + c1 * x2[i];
@@ -160,6 +162,8 @@ namespace internal {
                      TS s2)
     {
         assert(x1.size() == x2.size() && x2.size() == x3.size());
+        if( c1 == real_t<T>{1.} && s1 == TS{0.} && c2 == real_t<T>{1.} && s2 == TS{0} )
+            return;
         const idx_t n = x1.size();
         for (idx_t i = 0; i < n; ++i) {
             T x2_g1 = c1 * x2[i] + s1 * x3[i];
@@ -208,6 +212,9 @@ namespace internal {
     {
         assert(x1.size() == x2.size() && x1.size() == x3.size() &&
                x1.size() == x4.size());
+        if( c1 == real_t<T>{1.} && s1 == TS{0.} && c2 == real_t<T>{1.} && s2 == TS{0} &&
+            c3 == real_t<T>{1.} && s3 == TS{0.} && c4 == real_t<T>{1.} && s4 == TS{0} )
+            return;
         const idx_t n = x1.size();
         for (idx_t i = 0; i < n; ++i) {
             T x2_g1 = c1 * x2[i] + s1 * x3[i];
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp
new file mode 100644
index 000000000..dd8fa211f
--- /dev/null
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp
@@ -0,0 +1,25 @@
+#ifndef LAPACK_CPP_STEQR_HPP
+#define LAPACK_CPP_STEQR_HPP
+
+#include "lapack_cpp/base.hpp"
+
+namespace lapack_cpp
+{
+
+    /**
+     * Computes all eigenvalues and, optionally, eigenvectors of a symmetric tridiagonal matrix using the QR algorithm.
+     */
+    template <typename T,
+              Layout layout,
+              typename idx_t,
+              bool aligned>
+    int steqr(
+        CompQ compz,
+        Vector<real_t<T>, idx_t> d,
+        Vector<real_t<T>, idx_t> e,
+        Matrix<T, layout, idx_t> Z,
+        MemoryBlock<real_t<T>, idx_t, aligned> &work);
+
+}
+
+#endif // LAPACK_CPP_STEQR_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
index 11016f6ed..00e564f8e 100644
--- a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
+++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp
@@ -498,4 +498,4 @@ namespace lapack_cpp
 
 }
 
-#endif
\ No newline at end of file
+#endif // LAPACK_CPP_STEQR3_HPP
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_c/steqr.f90 b/LAPACK_CPP/src/lapack_c/steqr.f90
new file mode 100644
index 000000000..f1a8e0bf7
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_c/steqr.f90
@@ -0,0 +1,65 @@
+subroutine lapack_c_ssteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_ssteqr')
+    use iso_c_binding
+    implicit none
+    character(kind=c_char), intent(in), value :: compz
+    integer(kind=c_int), intent(in), value :: n, ldz
+    real(kind=c_float), intent(inout) :: d(*), e(*), Z(ldz,*)
+    real(kind=c_float), intent(out) :: work(*)
+    integer(kind=c_int), intent(out) :: info
+
+    external ssteqr
+
+    ! Forward call to fortran implementation
+    call ssteqr( compz, n, d, e, Z, ldz, work, info )
+
+end subroutine lapack_c_ssteqr
+
+subroutine lapack_c_dsteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_dsteqr')
+    use iso_c_binding
+    implicit none
+    character(kind=c_char), intent(in), value :: compz
+    integer(kind=c_int), intent(in), value :: n, ldz
+    real(kind=c_double), intent(inout) :: d(*), e(*), Z(ldz,*)
+    real(kind=c_double), intent(out) :: work(*)
+    integer(kind=c_int), intent(out) :: info
+
+    external dsteqr
+
+    ! Forward call to fortran implementation
+    call dsteqr( compz, n, d, e, Z, ldz, work, info )
+
+end subroutine lapack_c_dsteqr
+
+subroutine lapack_c_csteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_csteqr')
+    use iso_c_binding
+    implicit none
+    character(kind=c_char), intent(in), value :: compz
+    integer(kind=c_int), intent(in), value :: n, ldz
+    real(kind=c_float), intent(inout) :: d(*), e(*)
+    complex(kind=c_float_complex), intent(inout) :: Z(ldz,*)
+    real(kind=c_float), intent(out) :: work(*)
+    integer(kind=c_int), intent(out) :: info
+
+    external csteqr
+
+    ! Forward call to fortran implementation
+    call csteqr( compz, n, d, e, Z, ldz, work, info )
+
+end subroutine lapack_c_csteqr
+
+subroutine lapack_c_zsteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_zsteqr')
+    use iso_c_binding
+    implicit none
+    character(kind=c_char), intent(in), value :: compz
+    integer(kind=c_int), intent(in), value :: n, ldz
+    real(kind=c_double), intent(inout) :: d(*), e(*)
+    complex(kind=c_double_complex), intent(inout) :: Z(ldz,*)
+    real(kind=c_double), intent(out) :: work(*)
+    integer(kind=c_int), intent(out) :: info
+
+    external zsteqr
+
+    ! Forward call to fortran implementation
+    call zsteqr( compz, n, d, e, Z, ldz, work, info )
+
+end subroutine lapack_c_zsteqr
\ No newline at end of file
diff --git a/LAPACK_CPP/src/lapack_cpp/steqr.cpp b/LAPACK_CPP/src/lapack_cpp/steqr.cpp
new file mode 100644
index 000000000..a484c7766
--- /dev/null
+++ b/LAPACK_CPP/src/lapack_cpp/steqr.cpp
@@ -0,0 +1,151 @@
+#include <cassert>
+#include <complex>
+#include <type_traits>
+
+#include "lapack_c.h"
+#include "lapack_cpp/base.hpp"
+#include "lapack_cpp/lapack/steqr.hpp"
+
+namespace lapack_cpp
+{
+
+    template <>
+    int steqr(CompQ compz, Vector<float, int> d, Vector<float, int> e, Matrix<float, Layout::ColMajor, int> Z, MemoryBlock<float, int, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_ssteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+    template <>
+    int steqr(CompQ compz, Vector<double, int> d, Vector<double, int> e, Matrix<double, Layout::ColMajor, int> Z, MemoryBlock<double, int, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_dsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+
+    template <>
+    int steqr(CompQ compz, Vector<float, int> d, Vector<float, int> e, Matrix<std::complex<float>, Layout::ColMajor, int> Z, MemoryBlock<float, int, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_csteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+    template <>
+    int steqr(CompQ compz, Vector<double, int> d, Vector<double, int> e, Matrix<std::complex<double>, Layout::ColMajor, int> Z, MemoryBlock<double, int, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_zsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+
+    template <>
+    int steqr(CompQ compz, Vector<float, int64_t> d, Vector<float, int64_t> e, Matrix<float, Layout::ColMajor, int64_t> Z, MemoryBlock<float, int64_t, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_ssteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+    template <>
+    int steqr(CompQ compz, Vector<double, int64_t> d, Vector<double, int64_t> e, Matrix<double, Layout::ColMajor, int64_t> Z, MemoryBlock<double, int64_t, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_dsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+
+    template <>
+    int steqr(CompQ compz, Vector<float, int64_t> d, Vector<float, int64_t> e, Matrix<std::complex<float>, Layout::ColMajor, int64_t> Z, MemoryBlock<float, int64_t, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_csteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+    template <>
+    int steqr(CompQ compz, Vector<double, int64_t> d, Vector<double, int64_t> e, Matrix<std::complex<double>, Layout::ColMajor, int64_t> Z, MemoryBlock<double, int64_t, true> &work)
+    {
+        assert(d.size() >= 0);
+        assert(d.size() == e.size() + 1);
+        assert(d.size() == Z.num_rows());
+        assert(d.size() == Z.num_columns());
+        assert(2*d.size()-2 == work.size());
+        // fortran code only supports stride 1
+        assert(d.stride() == 1);
+        assert(e.stride() == 1);
+        lapack_idx info;
+        lapack_c_zsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info);
+
+        return info;
+    }
+
+} // namespace lapack_cpp
\ No newline at end of file