From 5b521e6ccc514cb85ef579b167ef1bf979b66872 Mon Sep 17 00:00:00 2001 From: thijssteel Date: Thu, 29 Feb 2024 16:04:38 +0100 Subject: [PATCH 01/10] copy over files from lapackv4 repo --- LAPACK_CPP/Makefile | 43 + LAPACK_CPP/example/.gitignore | 2 + LAPACK_CPP/example/profile_lasr3.cpp | 99 +++ LAPACK_CPP/example/test_lasr3.cpp | 115 +++ LAPACK_CPP/include/lapack_c.h | 13 + LAPACK_CPP/include/lapack_c/lartg.h | 22 + LAPACK_CPP/include/lapack_c/lasr3.h | 94 ++ LAPACK_CPP/include/lapack_c/rot.h | 46 + LAPACK_CPP/include/lapack_c/util.h | 19 + LAPACK_CPP/include/lapack_cpp.hpp | 9 + LAPACK_CPP/include/lapack_cpp/base.hpp | 10 + LAPACK_CPP/include/lapack_cpp/base/enums.hpp | 84 ++ LAPACK_CPP/include/lapack_cpp/base/matrix.hpp | 334 +++++++ .../include/lapack_cpp/base/memory_block.hpp | 136 +++ LAPACK_CPP/include/lapack_cpp/base/types.hpp | 33 + LAPACK_CPP/include/lapack_cpp/base/vector.hpp | 203 +++++ LAPACK_CPP/include/lapack_cpp/blas.hpp | 6 + LAPACK_CPP/include/lapack_cpp/blas/rot.hpp | 24 + LAPACK_CPP/include/lapack_cpp/lapack.hpp | 7 + .../include/lapack_cpp/lapack/lartg.hpp | 16 + .../include/lapack_cpp/lapack/lasr3.hpp | 815 ++++++++++++++++++ LAPACK_CPP/include/lapack_cpp/utils.hpp | 87 ++ LAPACK_CPP/src/lapack_c/lartg_c.f90 | 53 ++ LAPACK_CPP/src/lapack_c/lasr3_c.cpp | 206 +++++ LAPACK_CPP/src/lapack_c/rot_c.f90 | 57 ++ LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp | 43 + LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp | 67 ++ 27 files changed, 2643 insertions(+) create mode 100644 LAPACK_CPP/Makefile create mode 100644 LAPACK_CPP/example/.gitignore create mode 100644 LAPACK_CPP/example/profile_lasr3.cpp create mode 100644 LAPACK_CPP/example/test_lasr3.cpp create mode 100644 LAPACK_CPP/include/lapack_c.h create mode 100644 LAPACK_CPP/include/lapack_c/lartg.h create mode 100644 LAPACK_CPP/include/lapack_c/lasr3.h create mode 100644 LAPACK_CPP/include/lapack_c/rot.h create mode 100644 LAPACK_CPP/include/lapack_c/util.h create mode 100644 LAPACK_CPP/include/lapack_cpp.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/base.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/base/enums.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/base/matrix.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/base/types.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/base/vector.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/blas.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/blas/rot.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/utils.hpp create mode 100644 LAPACK_CPP/src/lapack_c/lartg_c.f90 create mode 100644 LAPACK_CPP/src/lapack_c/lasr3_c.cpp create mode 100644 LAPACK_CPP/src/lapack_c/rot_c.f90 create mode 100644 LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp create mode 100644 LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile new file mode 100644 index 000000000..6d0d8eb0c --- /dev/null +++ b/LAPACK_CPP/Makefile @@ -0,0 +1,43 @@ +CXX=g++ +# CXXFLAGS=-std=c++17 -Wall -Wextra -pedantic -g -Og +CXXFLAGS=-std=c++17 -O3 -march=native -ffast-math -g -DNDEBUG + +FC=gfortran +FFLAGS=-Wall -Wextra -pedantic -g + +all: ./example/test_lasr3 ./example/profile_lasr3 + +HEADERS = $(wildcard lapack_c/include/*.h) \ + $(wildcard lapack_c/include/*/*.h) \ + $(wildcard lapack_c/include/*/*/*.h) \ + $(wildcard lapack_cpp/include/*.hpp) \ + $(wildcard lapack_cpp/include/*/*.hpp) \ + $(wildcard lapack_cpp/include/*/*/*.hpp) + +OBJFILES = src/lapack_c/rot_c.o \ + src/lapack_c/lartg_c.o \ + src/lapack_c/lasr3_c.o \ + src/lapack_cpp/rot_cpp.o \ + src/lapack_cpp/lartg_cpp.o \ + +# C files + +src/lapack_c/%.o: src/lapack_c/%.cpp + $(CXX) $(CXXFLAGS) -I./include -c -o $@ $< + +src/lapack_c/%.o: src/lapack_c/%.f90 + $(FC) $(FFLAGS) -c -o $@ $< + +# C++ files +src/lapack_cpp/%.o: src/lapack_cpp/%.cpp $(HEADERS) + $(CXX) $(CXXFLAGS) -I./include -c -o $@ $< + +# Test files +example/test_lasr3: example/test_lasr3.cpp $(OBJFILES) $(HEADERS) + $(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack + +example/profile_lasr3: example/profile_lasr3.cpp $(OBJFILES) $(HEADERS) + $(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack + +clean: + find . -type f -name '*.o' -delete \ No newline at end of file diff --git a/LAPACK_CPP/example/.gitignore b/LAPACK_CPP/example/.gitignore new file mode 100644 index 000000000..ed20f170e --- /dev/null +++ b/LAPACK_CPP/example/.gitignore @@ -0,0 +1,2 @@ +test_lasr3 +profile_lasr3 \ No newline at end of file diff --git a/LAPACK_CPP/example/profile_lasr3.cpp b/LAPACK_CPP/example/profile_lasr3.cpp new file mode 100644 index 000000000..456af8df8 --- /dev/null +++ b/LAPACK_CPP/example/profile_lasr3.cpp @@ -0,0 +1,99 @@ + +#include // for high_resolution_clock +#include +#include + +#include "lapack_cpp.hpp" + +using namespace lapack_cpp; + +template +void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction) +{ + idx_t n_rot = side == Side::Left ? m - 1 : n - 1; + + MemoryBlock A_(m, n, layout); + Matrix A(m, n, A_); + + MemoryBlock, idx_t> C_(n_rot, k, layout); + Matrix, layout, idx_t> C(n_rot, k, C_); + + MemoryBlock S_(n_rot, k, layout); + Matrix S(n_rot, k, S_); + + randomize(A); + + // Generate random, but valid rotations + randomize(C); + randomize(S); + for (idx_t i = 0; i < n_rot; ++i) { + for (idx_t j = 0; j < k; ++j) { + T f = C(i, j); + T g = S(i, j); + T r; + lartg(f, g, C(i, j), S(i, j), r); + } + } + + MemoryBlock A_copy_(m, n, layout); + Matrix A_copy(m, n, A_copy_); + + A_copy = A; + + MemoryBlock work( + lasr3_workquery(side, direction, C.as_const(), S.as_const(), A)); + + const idx_t n_timings = 100; + const idx_t n_warmup = 20; + std::vector timings(n_timings); + + for (idx_t i = 0; i < n_timings; ++i) { + A = A_copy; + auto start = std::chrono::high_resolution_clock::now(); + lasr3(side, direction, C.as_const(), S.as_const(), A, work); + auto end = std::chrono::high_resolution_clock::now(); + timings[i] = std::chrono::duration(end - start).count(); + } + + float mean = 0; + for (idx_t i = n_warmup; i < n_timings; ++i) + mean += timings[i]; + mean /= n_timings - n_warmup; + + long nflops = + side == Side::Left ? 6. * (m - 1) * n * k : 6. * (n - 1) * m * k; + + std::cout << "m = " << m << ", n = " << n << ", k = " << k + << ", side = " << (char)side + << ", direction = " << (char)direction << ", mean time = " << mean + << " s" + << ", flop rate = " << nflops / mean * 1.0e-9 << " GFlops" + << std::endl; +} + +int main() +{ + typedef lapack_idx_t idx_t; + typedef double T; + + const idx_t nb = 1000; + const idx_t k = 64; + + for (int s = 0; s < 2; ++s) { + Side side = s == 0 ? Side::Left : Side::Right; + for (int d = 0; d < 2; ++d) { + Direction direction = + d == 0 ? Direction::Forward : Direction::Backward; + for (idx_t n_rot = 99; n_rot <= 1600; n_rot += 100) { + idx_t m = side == Side::Left ? n_rot + 1 : nb; + idx_t n = side == Side::Left ? nb : n_rot + 1; + + profile_lasr3(m, n, k, side, + direction); + } + } + } + return 0; +} \ No newline at end of file diff --git a/LAPACK_CPP/example/test_lasr3.cpp b/LAPACK_CPP/example/test_lasr3.cpp new file mode 100644 index 000000000..98f5d29a5 --- /dev/null +++ b/LAPACK_CPP/example/test_lasr3.cpp @@ -0,0 +1,115 @@ + +#include +#include + +#include "lapack_cpp.hpp" + +using namespace lapack_cpp; + +template +void test_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction) +{ + idx_t n_rot = side == Side::Left ? m - 1 : n - 1; + + MemoryBlock A_(m, n, layout); + Matrix A(m, n, A_); + + MemoryBlock, idx_t> C_(n_rot, k, layout); + Matrix, layout, idx_t> C(n_rot, k, C_); + + MemoryBlock S_(n_rot, k, layout); + Matrix S(n_rot, k, S_); + + randomize(A); + + // Generate random, but valid rotations + randomize(C); + randomize(S); + for (idx_t i = 0; i < n_rot; ++i) { + for (idx_t j = 0; j < k; ++j) { + T f = C(i, j); + T g = S(i, j); + T r; + lartg(f, g, C(i, j), S(i, j), r); + } + } + + MemoryBlock A_copy_(m, n, layout); + Matrix A_copy(m, n, A_copy_); + + A_copy = A; + + // Apply rotations using simple loops as test + if (side == Side::Left) { + if (direction == Direction::Forward) { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = 0; j < n_rot; ++j) + rot(A_copy.row(j), A_copy.row(j + 1), C(j, i), S(j, i)); + } + else { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = n_rot - 1; j >= 0; --j) + rot(A_copy.row(j), A_copy.row(j + 1), C(j, i), S(j, i)); + } + } + else { + if (direction == Direction::Forward) { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = 0; j < n_rot; ++j) + rot(A_copy.column(j), A_copy.column(j + 1), C(j, i), + conj(S(j, i))); + } + else { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = n_rot - 1; j >= 0; --j) + rot(A_copy.column(j), A_copy.column(j + 1), C(j, i), + conj(S(j, i))); + } + } + + lasr3(side, direction, C.as_const(), S.as_const(), A); + + // Check that the result is the same + for (idx_t i = 0; i < m; ++i) + for (idx_t j = 0; j < n; ++j) + A_copy(i, j) -= A(i, j); + + real_t err = 0.; + for (idx_t i = 0; i < m; ++i) + for (idx_t j = 0; j < n; ++j) + err = std::max(err, abs(A_copy(i, j))); + + // This test should really be relative + if( err > 1e-5 ){ + std::cout << "Failed test_lasr3 with parameters: " << m << ", " << n << ", " << k << ", " << (char)side << ", " << (char)direction << std::endl; + std::cout << "Error: " << err << std::endl; + } else { + std::cout << "Passed test_lasr3 with parameters: " << m << ", " << n << ", " << k << ", " << (char)side << ", " << (char)direction << std::endl; + } + + // print(A_copy); +} + +int main() +{ + typedef lapack_idx_t idx_t; + typedef std::complex T; + + for (idx_t nb = 64; nb <= 2000; nb *= 2) { + for (idx_t n_rot = 2; n_rot <= 2000; n_rot*=2) { + for (idx_t k = 32; k <= 128; k+=32) { + test_lasr3( + n_rot + 1, nb, k, Side::Left, Direction::Forward); + test_lasr3( + nb, n_rot + 1, k, Side::Right, Direction::Forward); + test_lasr3( + n_rot + 1, nb, k, Side::Left, Direction::Backward); + test_lasr3( + nb, n_rot + 1, k, Side::Right, Direction::Backward); + } + } + } + return 0; +} \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h new file mode 100644 index 000000000..4212a73f4 --- /dev/null +++ b/LAPACK_CPP/include/lapack_c.h @@ -0,0 +1,13 @@ +#ifndef LAPACK_C_HPP +#define LAPACK_C_HPP + +#include "lapack_c/util.h" + +// BLAS +#include "lapack_c/rot.h" + +// LAPACK +#include "lapack_c/lartg.h" + + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/lartg.h b/LAPACK_CPP/include/lapack_c/lartg.h new file mode 100644 index 000000000..9b23aa60d --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/lartg.h @@ -0,0 +1,22 @@ +#ifndef LAPACK_C_LARTG_HPP +#define LAPACK_C_LARTG_HPP + +#include "lapack_c/util.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +void lapack_c_slartg(float f, float g, float* cs, float* sn, float* r); + +void lapack_c_dlartg(double f, double g, double* cs, double* sn, double* r); + +void lapack_c_clartg(lapack_float_complex f, lapack_float_complex g, float* cs, lapack_float_complex* sn, lapack_float_complex* r); + +void lapack_c_zlartg(lapack_double_complex f, lapack_double_complex g, double* cs, lapack_double_complex* sn, lapack_double_complex* r); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/lasr3.h b/LAPACK_CPP/include/lapack_c/lasr3.h new file mode 100644 index 000000000..efd86a916 --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/lasr3.h @@ -0,0 +1,94 @@ +#ifndef LAPACK_C_ROT_HPP +#define LAPACK_C_ROT_HPP + +#include "lapack_c/util.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +void lapack_c_slasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float* C, + lapack_idx ldc, + const float* S, + lapack_idx lds, + float* A, + lapack_idx lda, + float* work, + lapack_idx lwork); + +void lapack_c_dlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double* C, + lapack_idx ldc, + const double* S, + lapack_idx lds, + double* A, + lapack_idx lda, + double* work, + lapack_idx lwork); + +void lapack_c_clasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float* C, + lapack_idx ldc, + const lapack_float_complex* S, + lapack_idx lds, + lapack_float_complex* A, + lapack_idx lda, + lapack_float_complex* work, + lapack_idx lwork); + +void lapack_c_sclasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float* C, + lapack_idx ldc, + const float* S, + lapack_idx lds, + lapack_float_complex* A, + lapack_idx lda, + lapack_float_complex* work, + lapack_idx lwork); + +void lapack_c_zlasr3(lapack_idx m, + lapack_idx n, + lapack_idx k, + const double* C, + lapack_idx ldc, + const lapack_double_complex* S, + lapack_idx lds, + lapack_double_complex* A, + lapack_idx lda, + lapack_double_complex* work, + lapack_idx lwork); + +void lapack_c_dzlasr3(lapack_idx m, + lapack_idx n, + lapack_idx k, + const double* C, + lapack_idx ldc, + const double* S, + lapack_idx lds, + lapack_double_complex* A, + lapack_idx lda, + lapack_double_complex* work, + lapack_idx lwork); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/rot.h b/LAPACK_CPP/include/lapack_c/rot.h new file mode 100644 index 000000000..0c180a76a --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/rot.h @@ -0,0 +1,46 @@ +#ifndef LAPACK_C_ROT_HPP +#define LAPACK_C_ROT_HPP + +#include "lapack_c/util.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +void lapack_c_drot(lapack_idx n, + double* x, + lapack_idx incx, + double* y, + lapack_idx incy, + double c, + double s); + +void lapack_c_srot(lapack_idx n, + float* x, + lapack_idx incx, + float* y, + lapack_idx incy, + float c, + float s); + +void lapack_c_crot(lapack_idx n, + lapack_float_complex* x, + lapack_idx incx, + lapack_float_complex* y, + lapack_idx incy, + float c, + lapack_float_complex s); + +void lapack_c_zrot(lapack_idx n, + lapack_double_complex* x, + lapack_idx incx, + lapack_double_complex* y, + lapack_idx incy, + double c, + lapack_double_complex s); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/util.h b/LAPACK_CPP/include/lapack_c/util.h new file mode 100644 index 000000000..9a874ae37 --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/util.h @@ -0,0 +1,19 @@ +#ifndef LAPACK_C_UTIL_H +#define LAPACK_C_UTIL_H + +// Define types for complex number +#ifdef __cplusplus +#include +#define lapack_float_complex std::complex +#define lapack_double_complex std::complex +#else +#define lapack_float_complex float _Complex +#define lapack_double_complex double _Complex +#endif // __cplusplus + +// Default to 32 bit integer if not yet defined +#ifndef lapack_idx +#define lapack_idx int +#endif // lapack_idx + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp.hpp b/LAPACK_CPP/include/lapack_cpp.hpp new file mode 100644 index 000000000..d54c2b9d8 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp.hpp @@ -0,0 +1,9 @@ +#ifndef LAPACK_CPP_HPP +#define LAPACK_CPP_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/blas.hpp" +#include "lapack_cpp/lapack.hpp" +#include "lapack_cpp/utils.hpp" + +#endif // LAPACK_CPP_HPP diff --git a/LAPACK_CPP/include/lapack_cpp/base.hpp b/LAPACK_CPP/include/lapack_cpp/base.hpp new file mode 100644 index 000000000..ec7dd5e49 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/base.hpp @@ -0,0 +1,10 @@ +#ifndef LAPACK_CPP_BASE_HPP +#define LAPACK_CPP_BASE_HPP + +#include "lapack_cpp/base/types.hpp" +#include "lapack_cpp/base/enums.hpp" +#include "lapack_cpp/base/matrix.hpp" +#include "lapack_cpp/base/memory_block.hpp" +#include "lapack_cpp/base/vector.hpp" + +#endif // LAPACK_CPP_BASE_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp new file mode 100644 index 000000000..d3f929aca --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp @@ -0,0 +1,84 @@ +#ifndef LAPACK_CPP_ENUMP_HPP +#define LAPACK_CPP_ENUMP_HPP + +#include + +namespace lapack_cpp { + +enum class Layout : char { RowMajor = 'R', ColMajor = 'C' }; + +enum class Op : char { NoTrans = 'N', Trans = 'T', ConjTrans = 'C' }; + +inline constexpr Op char2op(char t) +{ + switch (t) { + case 'N': + return Op::NoTrans; + case 'T': + return Op::Trans; + case 'C': + return Op::ConjTrans; + default: + assert(false); + } +} + +enum class Side : char { Left = 'L', Right = 'R' }; + +inline constexpr Side char2side(char t) +{ + switch (t) { + case 'L': + return Side::Left; + case 'R': + return Side::Right; + default: + assert(false); + } +} + +enum class Uplo : char { Upper = 'U', Lower = 'L' }; + +inline constexpr Uplo char2uplo(char t) +{ + switch (t) { + case 'U': + return Uplo::Upper; + case 'R': + return Uplo::Lower; + default: + assert(false); + } +} + +enum class Diag : char { Unit = 'U', NonUnit = 'N' }; + +inline constexpr Diag char2diag(char t) +{ + switch (t) { + case 'U': + return Diag::Unit; + case 'N': + return Diag::NonUnit; + default: + assert(false); + } +} + +enum class Direction : char { Forward = 'F', Backward = 'B' }; + +inline constexpr Direction char2direct(char t) +{ + switch (t) { + case 'F': + return Direction::Forward; + case 'N': + return Direction::Backward; + default: + assert(false); + } +} + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_ENUMP_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp new file mode 100644 index 000000000..ad10aae8c --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp @@ -0,0 +1,334 @@ + +#ifndef LAPACK_CPP_MATRIX_HPP +#define LAPACK_CPP_MATRIX_HPP + +#include +#include +#include +#include +#include +#include + +#include "lapack_cpp/base/types.hpp" +#include "lapack_cpp/base/enums.hpp" +#include "lapack_cpp/base/memory_block.hpp" +#include "lapack_cpp/base/vector.hpp" + +namespace lapack_cpp { + +// Forward declaration of ConstMatrix +template +class ConstMatrix; + +/** + * A matrix class that can be used to store matrices of arbitrary size. + * + * The elements of the matrix are stored in column-major order + * + * @tparam T this is a template parameter that specifies the type of the + * elements of the vector. + */ +template +class Matrix { + public: + typedef T value_type; + typedef idx_t index_type; + // Constructor for a matrix of size m x n + template + Matrix(const idx_t m, + const idx_t n, + const MemoryBlock& m_block) + : m_(m), + n_(n), + ldim_(calc_ld(layout == Layout::ColMajor ? m_ : n_)), + data_(m_block.ptr()) + { + assert(m_block.size() >= + ldim_ * (layout == Layout::ColMajor ? n_ : m_)); + } + + // Constructor for a matrix of size m x n + template + Matrix(const idx_t m, + const idx_t n, + const MemoryBlock& m_block, + const idx_t ldim) + : m_(m), n_(n), ldim_(ldim), data_(m_block.ptr()) + { + assert(m_block.size() >= + ldim_ * (layout == Layout::ColMajor ? n_ : m_)); + } + + // Constructor for a matrix of size m x n + Matrix(const idx_t m, const idx_t n, T* ptr, const idx_t ldim) + : m_(m), n_(n), ldim_(ldim), data_(ptr) + { + assert(ldim >= (layout == Layout::ColMajor ? m : n)); + } + + // Copy constructor + Matrix(const Matrix& m) + : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr()) + {} + + // Make a const promotion of the matrix + ConstMatrix as_const() const + { + return ConstMatrix(m_, n_, data_, ldim_); + } + + // Assignment operator + void operator=(const Matrix& m) + { + assert(m.num_rows() == m_); + assert(m.num_columns() == n_); + for (idx_t i = 0; i < m_; ++i) { + for (idx_t j = 0; j < n_; ++j) { + (*this)(i,j) = m(i, j); + } + } + } + + // Returns the number of rows of the matrix + inline idx_t num_rows() const { return m_; } + + // Returns the number of columns of the matrix + inline idx_t num_columns() const { return n_; } + + // Returns the leading dimension of the matrix + inline idx_t ldim() const { return ldim_; } + + // Returns the size of the matrix (i.e. the number of elements m * n) + inline idx_t size() const { return n_ * m_; } + + // Returns the ij-th element of the matrix + inline T& operator()(idx_t i, idx_t j) const + { + assert(i < m_); + assert(j < n_); + if (layout == Layout::ColMajor) + return data_[i + j * ldim_]; + else + return data_[i * ldim_ + j]; + } + + /** + * Return a submatrix of the matrix. + * + * @param i1 index of the first row of the submatrix + * @param i2 index of the last row of the submatrix (not inclusive) + * @param j1 index of the first column of the submatrix + * @param j2 index of the last column of the submatrix (not inclusive) + * + * @example + * Matrix A(5,5); + * auto B = A.submatrix(1, 4, 1, 4); // B is a 3x3 submatrix of A + */ + inline Matrix submatrix(idx_t i1, idx_t i2, idx_t j1, idx_t j2) const + { + assert(i1 >= 0); + assert(i2 <= m_); + assert(j1 >= 0); + assert(j2 <= n_); + assert(i1 < i2); + assert(j1 < j2); + if (layout == Layout::ColMajor) + return Matrix(i2 - i1, j2 - j1, &data_[i1 + j1 * ldim_], ldim_); + else + return Matrix(i2 - i1, j2 - j1, &data_[i1 * ldim_ + j1], ldim_); + } + + /** + * Return a column of the matrix as a vector + * + * @param i index of the column + */ + inline Vector column(idx_t i) const + { + assert(i >= 0); + assert(i < n_); + if (layout == Layout::ColMajor) + return Vector(m_, &data_[i * ldim_], 1); + else + return Vector(m_, &data_[i], ldim_); + } + + /** + * Return a row of the matrix as a vector + * + * @param i index of the row + */ + inline Vector row(idx_t i) const + { + assert(i >= 0); + assert(i < m_); + if (layout == Layout::ColMajor) + return Vector(n_, &data_[i], ldim_); + else + return Vector(n_, &data_[i * ldim_], 1); + } + + /** + * Return a pointer to the data of the matrix. + */ + inline T* ptr() const { return data_; } + + private: + // Number of rows of the matrix + const idx_t m_; + // Number of columns of the matrix + const idx_t n_; + // Leading dimension of the matrix + const idx_t ldim_; + // Pointer to the data + T* data_; +}; + +template +class ConstMatrix { + public: + typedef T value_type; + typedef idx_t index_type; + // Constructor for a matrix of size m x n + template + ConstMatrix(const idx_t m, + const idx_t n, + const MemoryBlock& m_block) + : m_(m), + n_(n), + ldim_(calc_ld(layout == Layout::ColMajor ? m_ : n_)), + data_(m_block.ptr()) + { + assert(m_block.size() >= + ldim_ * (layout == Layout::ColMajor ? n_ : m_)); + } + + // Constructor for a matrix of size m x n + template + ConstMatrix(const idx_t m, + const idx_t n, + const MemoryBlock& m_block, + const idx_t ldim) + : m_(m), n_(n), ldim_(ldim), data_(m_block.ptr()) + { + assert(m_block.size() >= + ldim_ * (layout == Layout::ColMajor ? n_ : m_)); + } + + // Constructor for a matrix of size m x n + ConstMatrix(const idx_t m, const idx_t n, const T* ptr, const idx_t ldim) + : m_(m), n_(n), ldim_(ldim), data_(ptr) + { + assert(ldim >= (layout == Layout::ColMajor ? m : n)); + } + + // Copy constructor + ConstMatrix(const ConstMatrix& m) + : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr()) + {} + + // Const promotion constructor + ConstMatrix(const Matrix& m) + : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr()) + {} + + // Returns the number of rows of the matrix + inline idx_t num_rows() const { return m_; } + + // Returns the number of columns of the matrix + inline idx_t num_columns() const { return n_; } + + // Returns the leading dimension of the matrix + inline idx_t ldim() const { return ldim_; } + + // Returns the size of the matrix (i.e. the number of elements m * n) + inline idx_t size() const { return n_ * m_; } + + // Returns the ij-th element of the matrix + inline T operator()(idx_t i, idx_t j) const + { + assert(i < m_); + assert(j < n_); + if (layout == Layout::ColMajor) + return data_[i + j * ldim_]; + else + return data_[i * ldim_ + j]; + } + + /** + * Return a submatrix of the matrix. + * + * @param i1 index of the first row of the submatrix + * @param i2 index of the last row of the submatrix (not inclusive) + * @param j1 index of the first column of the submatrix + * @param j2 index of the last column of the submatrix (not inclusive) + * + * @example + * Matrix A(5,5); + * auto B = A.submatrix(1, 4, 1, 4); // B is a 3x3 submatrix of A + */ + inline ConstMatrix submatrix(idx_t i1, idx_t i2, idx_t j1, idx_t j2) const + { + assert(i1 >= 0); + assert(i2 <= m_); + assert(j1 >= 0); + assert(j2 <= n_); + assert(i1 < i2); + assert(j1 < j2); + if (layout == Layout::ColMajor) + return ConstMatrix(i2 - i1, j2 - j1, &data_[i1 + j1 * ldim_], + ldim_); + else + return ConstMatrix(i2 - i1, j2 - j1, &data_[i1 * ldim_ + j1], + ldim_); + } + + /** + * Return a column of matrix as a vector + * + * @param i index of the column + */ + inline ConstVector column(idx_t i) const + { + assert(i >= 0); + assert(i < n_); + if (layout == Layout::ColMajor) + return ConstVector(m_, &data_[i * ldim_], 1); + else + return ConstVector(m_, &data_[i], ldim_); + } + + /** + * Return a row of matrix as a vector + * + * @param i index of the row + */ + inline ConstVector row(idx_t i) const + { + assert(i >= 0); + assert(i < m_); + if (layout == Layout::ColMajor) + return ConstVector(n_, &data_[i], ldim_); + else + return ConstVector(n_, &data_[i * ldim_], 1); + } + + /** + * Return a pointer to the data of the matrix. + */ + inline const T* ptr() const { return data_; } + + private: + // Number of rows of the matrix + const idx_t m_; + // Number of columns of the matrix + const idx_t n_; + // Leading dimension of the matrix + const idx_t ldim_; + // Pointer to the data + const T* data_; +}; + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_MATRIX_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp b/LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp new file mode 100644 index 000000000..c2ccb90a2 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/base/memory_block.hpp @@ -0,0 +1,136 @@ + +#ifndef LAPACK_CPP_MEMORY_BLOCK_HPP +#define LAPACK_CPP_MEMORY_BLOCK_HPP + +#include +#include +#include +#include + +#include "lapack_cpp/base/types.hpp" + +namespace lapack_cpp { + +/** + * Calculate the leading dimension of a matrix, given the number of rows. + * Normally, the leading dimension can just be equal to to the number of rows, + * but for performance reasons, we want to make sure that the leading dimension + * is a multiple of 64 bytes. That way, if the first column is aligned, they all + * are. + * + * @tparam T + * @param m + * @return int + */ +template +inline constexpr idx_t calc_ld(const idx_t m) +{ + if (aligned) + return ((m - 1 + (64 / sizeof(T))) / (64 / sizeof(T))) * + (64 / sizeof(T)); + else + return m; +} + +/** + * A class representing a contiguous block of memory of fixed size. + * + * This class owns the data. It is responsible for allocating and deallocating + * the memory. + * + * A special note about const. Declaring a Memory block as const does not make + * the data const. For example: + * const MemoryBlock A(10); + * double* data = A.ptr(); + * data[0] = 1.0; // This is allowed + */ +template +class MemoryBlock { + public: + // Constructor for a block of size n + MemoryBlock(idx_t n) + : n_(n), + data_(aligned ? (T*)aligned_alloc(64, n_ * sizeof(T)) + : (T*)malloc(n_ * sizeof(T))) + { + assert(n >= 0); + #ifndef NDEBUG + // When debugging, fill the memory with NaNs + // This makes it easier to spot uninitialized memory + std::fill(data_, data_ + n_, NAN); + #endif + } + + // Constructor for a block of sufficient size to store a matrix of size m x + // n + MemoryBlock(idx_t m, idx_t n, Layout layout = Layout::ColMajor) + : n_(layout == Layout::ColMajor ? (calc_ld(m) * n) : (calc_ld(n) * m)), + data_(aligned ? (T*)aligned_alloc(64, n_ * sizeof(T)) + : (T*)malloc(n_ * sizeof(T))) + { + assert(m >= 0); + assert(n >= 0); + #ifndef NDEBUG + // When debugging, fill the memory with NaNs + // This makes it easier to spot uninitialized memory + std::fill(data_, data_ + n_, NAN); + #endif + } + + // Non-owning constructor + MemoryBlock(idx_t n, T* data) : n_(n), data_(data), owns_data_(false) + { + assert(n >= 0); + } + + // Destructor + ~MemoryBlock() + { + if (owns_data_) free((std::remove_const_t*)data_); + } + + // Copy constructor + MemoryBlock(const MemoryBlock& other) + : n_(other.n_), data_(other.data_), owns_data_(false) + {} + + // Assignment operator + void operator=(const MemoryBlock& other) + { + assert(other.size() == this->size()); + std::copy(other.data_, other.data_ + n_, data_); + } + + // Memory block that remains after creating a vector of size n + MemoryBlock remainder(idx_t n) const + { + idx_t size = calc_ld(n); + assert(size <= n_); + return MemoryBlock(n_ - n, data_ + n); + } + + // Memory block that remains after creating a vector of size n + MemoryBlock remainder(idx_t m, idx_t n) const + { + idx_t size = calc_ld(calc_ld(m) * n); + assert(size <= n_); + return MemoryBlock(n_ - size, data_ + size); + } + + // Returns the size of the memory block + inline idx_t size() const { return n_; } + + /** + * Return a pointer to the data of the vector. + */ + inline T* ptr() const { return data_; } + + private: + const idx_t n_; + T* data_; + const bool owns_data_ = true; +}; + +} // namespace lapack_cpp + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/base/types.hpp b/LAPACK_CPP/include/lapack_cpp/base/types.hpp new file mode 100644 index 000000000..2ce546bae --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/base/types.hpp @@ -0,0 +1,33 @@ +#ifndef LAPACK_CPP_BASE_TYPES_HPP +#define LAPACK_CPP_BASE_TYPES_HPP + +#include +#include +#include + +// While the library is templated and can handle different integers, +// this is the default integer type used by the library. +typedef int64_t lapack_idx_t; + +// declare conj for real types +template , bool> = true> +inline T conj(const T& x) +{ + return x; +} + +template +struct real_type_struct { + typedef T type; +}; + +template +struct real_type_struct> { + typedef T type; +}; + +template +using real_t = typename real_type_struct::type; + +#endif // LAPACK_CPP_BASE_TYPES_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/base/vector.hpp b/LAPACK_CPP/include/lapack_cpp/base/vector.hpp new file mode 100644 index 000000000..d0d723afd --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/base/vector.hpp @@ -0,0 +1,203 @@ + +#ifndef LAPACK_CPP_VECTOR_HPP +#define LAPACK_CPP_VECTOR_HPP + +#include +#include +#include + +#include "lapack_cpp/base/types.hpp" +#include "lapack_cpp/base/enums.hpp" +#include "lapack_cpp/base/memory_block.hpp" + +namespace lapack_cpp { + +// Forward declaration of ConstVector +template +class ConstVector; + +/** + * A vector class that can be used to store vectors of arbitrary size. + * + * @tparam T this is a template parameter that specifies the type of the + * elements of the vector. + */ +template +class Vector { + public: + typedef T value_type; + typedef idx_t index_type; + + // Constructor for a vector of size n + template + Vector(idx_t n, + const MemoryBlock& m_block, + idx_t stride = 1) + : n_(n), stride_(stride), data_(m_block.ptr()) + { + assert(n >= 0); + assert(m_block.size() >= n); + } + + // Constructor for a vector of size n + Vector(idx_t n, T* data, idx_t stride = 1) + : n_(n), stride_(stride), data_(data) + { + assert(n >= 0); + } + + // Copy constructor + // Note: the default copy constructor would probably work, but we want to + // emphasize that we are doing a shallow copy here. + Vector(const Vector& v) : n_(v.n_), stride_(v.stride_), data_(v.data_) {} + + // Assignment operator + void operator=(const Vector& v) + { + assert(v.size() == this->size()); + for (int i = 0; i < this->size(); ++i) { + (*this)[i] = v[i]; + } + } + + // Make a const promotion of the vector + inline ConstVector as_const() const + { + return ConstVector(n_, data_, stride_); + } + + // Returns the size of the vector + inline idx_t size() const { return n_; } + + inline idx_t stride() const { return stride_; } + + // Returns the i-th element of the vector + // Note: this function will only be called when the vector is not const + // therefore, we return the element by reference. + inline T& operator[](const idx_t i) const + { + assert(i < n_); + return data_[i * stride_]; + } + + /** + * The subvector starts at index start and ends at index end - 1. + * The stride is the step size between two elements of the subvector. + */ + inline Vector subvector(idx_t start, idx_t end, idx_t stride = 1) const + { + assert(start >= 0); + assert(end <= n_); + assert(start < end); + assert(stride > 0); + return Vector((end - start) / stride, &data_[start * stride_], + stride * stride_); + } + + /** + * Return a pointer to the data of the vector. + */ + inline T* ptr() const { return data_; } + + private: + const idx_t n_; + const idx_t stride_; + T* data_; +}; + +/** + * A vector class that can be used to store vectors of arbitrary size. + * + * @tparam T this is a template parameter that specifies the type of the + * elements of the vector. + */ +template +class ConstVector { + public: + typedef T value_type; + typedef idx_t index_type; + // Constructor for a vector of size n + template + ConstVector(idx_t n, + const MemoryBlock& m_block, + idx_t stride = 1) + : n_(n), stride_(stride), data_(m_block.ptr()) + { + assert(n >= 0); + assert(m_block.size() >= n); + } + + // Constructor for a vector of size n + ConstVector(idx_t n, const T* data, idx_t stride = 1) + : n_(n), stride_(stride), data_(data) + { + assert(n >= 0); + } + + // Copy constructor + // Note: the default copy constructor would probably work, but we want to + // emphasize that we are doing a shallow copy here. + ConstVector(const ConstVector& v) + : n_(v.n_), stride_(v.stride_), data_(v.data_) + {} + + // Const promotion constructor + ConstVector(const Vector& v) + : n_(v.size()), stride_(v.stride()), data_(v.ptr()) + {} + + // Assignment operator + void operator=(const Vector& v) + { + assert(v.size() == this->size()); + for (int i = 0; i < this->size(); ++i) { + (*this)[i] = v[i]; + } + } + + // Returns the size of the vector + inline idx_t size() const { return n_; } + + inline idx_t stride() const { return stride_; } + + // Returns the i-th element of the vector + // Note: this function will only be called when the vector is not const + // therefore, we return the element by reference. + inline T operator[](const idx_t i) const + { + assert(i < n_); + return data_[i * stride_]; + } + + /** + * The subvector starts at index start and ends at index end - 1. + * The stride is the step size between two elements of the subvector. + * + * Note: this function will also be called if the matrix is const, even + * though the returned object is not const. Solving this problem would + * require either an inefficient or a more complex design. + */ + inline ConstVector subvector(idx_t start, idx_t end, idx_t stride = 1) const + { + assert(start >= 0); + assert(end <= n_); + assert(start < end); + assert(stride > 0); + return ConstVector((end - start) / stride, &data_[start * stride_], + stride * stride_); + } + + /** + * Return a pointer to the data of the vector. + */ + inline const T* ptr() const { return data_; } + + private: + const idx_t n_; + const idx_t stride_; + const T* data_; +}; + +} // namespace lapack_cpp + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/blas.hpp b/LAPACK_CPP/include/lapack_cpp/blas.hpp new file mode 100644 index 000000000..b2d996ee0 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/blas.hpp @@ -0,0 +1,6 @@ +#ifndef LAPACK_CPP_BLAS_HPP +#define LAPACK_CPP_BLAS_HPP + +#include "lapack_cpp/blas/rot.hpp" + +#endif // LAPACK_CPP_BLAS_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/blas/rot.hpp b/LAPACK_CPP/include/lapack_cpp/blas/rot.hpp new file mode 100644 index 000000000..29db48f66 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/blas/rot.hpp @@ -0,0 +1,24 @@ +#ifndef LAPACK_CPP_ROT_HPP +#define LAPACK_CPP_ROT_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/utils.hpp" +namespace lapack_cpp { + +/** + * Apply a plane rotation to a pair of vectors. + */ +template +void rot(const Vector& x, const Vector& y, real_t c, TS s){ + assert(x.size() == y.size()); + const idx_t n = x.size(); + for (idx_t i = 0; i < n; ++i) { + T temp = -conj(s) * x[i] + c * y[i]; + x[i] = c * x[i] + s * y[i]; + y[i] = temp; + } +} + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_GEMV_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack.hpp b/LAPACK_CPP/include/lapack_cpp/lapack.hpp new file mode 100644 index 000000000..31e03ce78 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack.hpp @@ -0,0 +1,7 @@ +#ifndef LAPACK_CPP_LAPACK_HPP +#define LAPACK_CPP_LAPACK_HPP + +#include "lapack_cpp/lapack/lartg.hpp" +#include "lapack_cpp/lapack/lasr3.hpp" + +#endif // LAPACK_CPP_LAPACK_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp new file mode 100644 index 000000000..5f5b345dd --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp @@ -0,0 +1,16 @@ +#ifndef LAPACK_CPP_LARTG_HPP +#define LAPACK_CPP_LARTG_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/utils.hpp" +namespace lapack_cpp { + +/** + * Generate a plane rotation. + */ +template +void lartg(T f, T g, real_t& c, T& s, T& r); + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_GEMV_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp new file mode 100644 index 000000000..0a3ffbb8a --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp @@ -0,0 +1,815 @@ +#ifndef LAPACK_CPP_LASR3_HPP +#define LAPACK_CPP_LASR3_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/blas/rot.hpp" +#include "lapack_cpp/utils.hpp" + +namespace lapack_cpp { + +/** + * Applies a sequence of plane rotations to a general rectangular matrix. + * + * @param side Specifies whether the plane rotations are applied to A on the + * left or right. + * + * @param direct Specifies whether the sequence is applied in forward or + * backward order. + * + * @param C The cosines of the rotations + * If side == Side::Left, C is an array of dimension m-1 x k + * If side == Side::Right, C is an array of dimension n-1 x k + * + * @param S The sines of the rotations + * If side == Side::Left, S is an array of dimension m-1 x k + * If side == Side::Right, S is an array of dimension n-1 x k + * + * @param A m x n matrix to which the rotations are to be applied. + * + * @param work A workspace array, its dimension is specified by lasr3_work. + * Note, because we use the work array to align the data for + * efficient vectorization, this routine only support aligned + * work arrays. + * + * @tparam T The type of the elements of the matrix A. + * + * @tparam TC The type of the elements of the matrix C. + * + * @tparam TS The type of the elements of the matrix S. + * + * @tparam layout The layout of the matrices A, C, and S. + * + * @tparam idx_t The type of the indices. + * + * @note if either TC or TS is a complex type, then T must also be a complex + * type. + * + */ +template +void lasr3(Side side, + Direction direct, + ConstMatrix C, + ConstMatrix S, + Matrix A, + MemoryBlock work); + +/** + * Workspace query for lasr3. + */ +template +idx_t lasr3_workquery(Side side, + Direction direct, + ConstMatrix C, + ConstMatrix S, + Matrix A) +{ + const idx_t m = A.num_rows(); + const idx_t n = A.num_columns(); + const idx_t k = C.num_columns(); + const idx_t num_rot = C.num_rows(); + + assert(side == Side::Left || side == Side::Right); + assert(direct == Direction::Forward || direct == Direction::Backward); + assert(C.num_rows() == S.num_rows()); + assert(C.num_columns() == S.num_columns()); + assert(side == Side::Left ? m : n == C.num_rows() + 1); + + // Blocksize for the number of rows/columns of A to be processed at once + const idx_t block_mn = 256; + // Blocksize for the number of rotation sequences to be processed at once + const idx_t block_k = 64; + const idx_t block_k2 = std::min(num_rot - 3, block_k); + + const idx_t nm2 = std::min(block_mn, side == Side::Left ? n : m); + const idx_t k2 = std::min(block_k2, k); + + return calc_ld(nm2) * (k2 + 2); +} + +/** + * @copydoc lasr3 + */ +template +void lasr3(Side side, + Direction direct, + ConstMatrix C, + ConstMatrix S, + Matrix A) +{ + idx_t lwork = lasr3_workquery(side, direct, C, S, A); + MemoryBlock work(lwork); + lasr3(side, direct, C, S, A, work); +} + +namespace internal { + + /** Applies two plane rotations to three vectors. + * + * [x1] [1 0 0 ] [c1 s1 0] [x1] + * [x2] = [0 c2 s2 ] * [-s1 c1 0] * [x2] + * [x3] [0 -s2 c2 ] [0 0 1] [x3] + * + * @param x1 The first vector. + * @param x2 The second vector. + * @param x3 The third vector. + * @param c1 The cosine of the first rotation. + * @param s1 The sine of the first rotation. + * @param c2 The cosine of the second rotation. + * @param s2 The sine of the second rotation. + */ + template + void rot_fuse2x1(Vector x1, + Vector x2, + Vector x3, + TC c1, + TS s1, + TC c2, + TS s2) + { + assert(x1.size() == x2.size() && x2.size() == x3.size()); + const idx_t n = x1.size(); + for (idx_t i = 0; i < n; ++i) { + T x2_g1 = -conj(s1) * x1[i] + c1 * x2[i]; + x1[i] = c1 * x1[i] + s1 * x2[i]; + x2[i] = c2 * x2_g1 + s2 * x3[i]; + x3[i] = -conj(s2) * x2_g1 + c2 * x3[i]; + } + } + + /** Applies two plane rotations to three vectors. + * + * [x1] [c2 s2 0] [1 0 0 ] [x1] + * [x2] = [-s2 c2 0] * [0 c1 s1 ] * [x2] + * [x3] [0 0 1] [0 -s1 c1 ] [x3] + * + * @param x1 The first vector. + * @param x2 The second vector. + * @param x3 The third vector. + * @param c1 The cosine of the first rotation. + * @param s1 The sine of the first rotation. + * @param c2 The cosine of the second rotation. + * @param s2 The sine of the second rotation. + */ + template + void rot_fuse1x2(Vector x1, + Vector x2, + Vector x3, + TC c1, + TS s1, + TC c2, + TS s2) + { + assert(x1.size() == x2.size() && x2.size() == x3.size()); + const idx_t n = x1.size(); + for (idx_t i = 0; i < n; ++i) { + T x2_g1 = c1 * x2[i] + s1 * x3[i]; + x3[i] = -conj(s1) * x2[i] + c1 * x3[i]; + x2[i] = -conj(s2) * x1[i] + c2 * x2_g1; + x1[i] = c2 * x1[i] + s2 * x2_g1; + } + } + + /** Applies four plane rotations to four vectors. + * + * [x1] [1 0 0 0] [1 0 0 0 ] [c2 s2 0 0] [1 0 0 0] [x1] + * [x2] = [0 c4 s4 0] * [0 1 0 0 ] * [-s2 c2 0 0] * [0 c1 s1 0] * [x2] + * [x3] [0 -s4 c4 0] [0 0 c3 s3] [0 0 1 0] [0 -s1 c1 0] [x3] + * [x4] [0 0 0 1] [0 0 -s3 c3] [0 0 0 1] [0 0 0 0] [x4] + * + * Note: the rotations are applied in the order G1, G2, G3, G4, + * but the order G1, G3, G2, G4 is also possible. + * + * @param x1 The first vector. + * @param x2 The second vector. + * @param x3 The third vector. + * @param x4 The fourth vector. + * @param c1 The cosine of the first rotation. + * @param s1 The sine of the first rotation. + * @param c2 The cosine of the second rotation. + * @param s2 The sine of the second rotation. + * @param c3 The cosine of the third rotation. + * @param s3 The sine of the third rotation. + * @param c4 The cosine of the fourth rotation. + * @param s4 The sine of the fourth rotation. + */ + template + void rot_fuse2x2(Vector x1, + Vector x2, + Vector x3, + Vector x4, + TC c1, + TS s1, + TC c2, + TS s2, + TC c3, + TS s3, + TC c4, + TS s4) + { + assert(x1.size() == x2.size() && x1.size() == x3.size() && + x1.size() == x4.size()); + const idx_t n = x1.size(); + for (idx_t i = 0; i < n; ++i) { + T x2_g1 = c1 * x2[i] + s1 * x3[i]; + T x3_g1 = -conj(s1) * x2[i] + c1 * x3[i]; + T x2_g2 = -conj(s2) * x1[i] + c2 * x2_g1; + x1[i] = c2 * x1[i] + s2 * x2_g1; + T x3_g3 = c3 * x3_g1 + s3 * x4[i]; + x4[i] = -conj(s3) * x3_g1 + c3 * x4[i]; + x2[i] = c4 * x2_g2 + s4 * x3_g3; + x3[i] = -conj(s4) * x2_g2 + c4 * x3_g3; + } + } + + // Kernel for lasr3, forward left variant + template + void lasr3_kernel_forward_left(ConstMatrix C, + ConstMatrix S, + Matrix A, + MemoryBlock work) + { + idx_t m = A.num_rows(); + idx_t n = A.num_columns(); + idx_t k = C.num_columns(); + + assert(C.num_rows() == S.num_rows()); + assert(C.num_columns() == S.num_columns()); + assert(m == C.num_rows() + 1); + assert(m > k + 1); + + // Number of rows that will be stored in the packed workspace matrix + const idx_t np = k + 2; + // During the algorithm, instead of applying the rotations directly to + // the matrix A, we will apply them to a packed version of A. This is + // done to allow for efficient vectorization of the rotations. The + // packed matrix is stored in the workspace array. Since we apply the + // rotations from the left, this packed matrix is always row-major + // regardless of the layout of A. + Matrix A_pack(np, n, work); + // A_pack will store np rows of A. + // To avoid having to shuffle rows in A, we keep track of two indices + // For i = i_pack2, ..., i_pack2 + np - 1, row (i_pack + i - i_pack2 + + // np) % np of A_pack corresponds to row i of A. + idx_t i_pack = 0; + idx_t i_pack2 = 0; + + // Copy the first np rows of A to A_pack + for (idx_t i = 0; i < np; ++i) { + for (idx_t j = 0; j < n; ++j) { + A_pack(i, j) = A(i, j); + } + } + + // Startup phase + for (idx_t j = 0; j + 1 < k; ++j) { + for (idx_t i = 0, g = j; i < j + 1; ++i, --g) { + rot(A_pack.row(g), A_pack.row(g + 1), C(g, i), S(g, i)); + } + } + + // Pipeline phase + for (idx_t j = k - 1; j + 1 < m - 1; j += 2) { + for (idx_t i = 0, g = j; i + 1 < k; i += 2, g -= 2) { + auto a1 = A_pack.row((g - 1 + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.row((g + i_pack - i_pack2 + np) % np); + auto a3 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np); + auto a4 = A_pack.row((g + 2 + i_pack - i_pack2 + np) % np); + + rot_fuse2x2(a1, a2, a3, a4, C(g, i), S(g, i), C(g - 1, i + 1), + S(g - 1, i + 1), C(g + 1, i), S(g + 1, i), + C(g, i + 1), S(g, i + 1)); + } + if (k % 2 == 1) { + // k is odd, so there are two more rotations to apply + idx_t i = k - 1; + idx_t g = j - i; + + auto a1 = A_pack.row((g + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np); + auto a3 = A_pack.row((g + 2 + i_pack - i_pack2 + np) % np); + + rot_fuse2x1(a1, a2, a3, C(g, i), S(g, i), C(g + 1, i), + S(g + 1, i)); + } + // rows i_pack and i_pack+1 of A_pack are finished, copy them back + // to A + for (idx_t i = 0; i < n; i++) { + A(i_pack2, i) = A_pack(i_pack, i); + A(i_pack2 + 1, i) = A_pack((i_pack + 1) % np, i); + } + // Pack next rows and update i_pack and i_pack2 + if (j + 4 < m) { + for (idx_t i = 0; i < n; i++) { + A_pack(i_pack, i) = A(j + 3, i); + A_pack((i_pack + 1) % np, i) = A(j + 4, i); + } + i_pack = (i_pack + 2) % np; + i_pack2 += 2; + } + else if (j + 3 < m) { + for (idx_t i = 0; i < n; i++) { + A_pack(i_pack, i) = A(j + 3, i); + } + i_pack = (i_pack + 1) % np; + i_pack2 += 1; + } + } + + // Shutdown phase + for (idx_t j = (m - k + 1) % 2; j < k; ++j) { + for (idx_t i = j, g = m - 2; i < k; ++i, --g) { + auto a1 = A_pack.row((g + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np); + rot(a1, a2, C(g, i), S(g, i)); + } + } + + // Copy the last np rows of A_pack back to A + for (idx_t i = 0; i < std::min(np, m - i_pack2); ++i) { + for (idx_t j = 0; j < n; ++j) { + A(i_pack2 + i, j) = A_pack((i_pack + i) % np, j); + } + } + } + + // Kernel for lasr3, backward left variant + template + void lasr3_kernel_backward_left(ConstMatrix C, + ConstMatrix S, + Matrix A, + MemoryBlock work) + { + idx_t m = A.num_rows(); + idx_t n = A.num_columns(); + idx_t k = C.num_columns(); + + assert(C.num_rows() == S.num_rows()); + assert(C.num_columns() == S.num_columns()); + assert(m == C.num_rows() + 1); + assert(m > k + 1); + + // Number of rows that will be stored in the packed workspace matrix + const idx_t np = k + 2; + // During the algorithm, instead of applying the rotations directly to + // the matrix A, we will apply them to a packed version of A. This is + // done to allow for efficient vectorization of the rotations. The + // packed matrix is stored in the workspace array. Since we apply the + // rotations from the left, this packed matrix is always row-major + // regardless of the layout of A. + Matrix A_pack(np, n, work); + // A_pack will store np rows of A. + // To avoid having to shuffle rows in A, we keep track of two indices + // For i = i_pack2, ..., i_pack2 + np - 1, row (i_pack + i - i_pack2 + + // np) % np of A_pack corresponds to row i of A. + idx_t i_pack = 0; + idx_t i_pack2 = m - np; + + // Copy the last np rows of A to A_pack + for (idx_t i = 0; i < np; ++i) { + for (idx_t j = 0; j < n; ++j) { + A_pack(i, j) = A(m - np + i, j); + } + } + + // Startup phase + for (idx_t j = 0; j + 1 < k; ++j) { + for (idx_t i = 0, g = m - 2 - j; i < j + 1; ++i, ++g) { + rot(A_pack.row((g - i_pack2) % np), + A_pack.row((g + 1 - i_pack2) % np), C(g, i), S(g, i)); + } + } + + // Pipeline phase + for (idx_t j = k - 1; j + 1 < m - 1; j += 2) { + for (idx_t i = 0, g = m - 2 - j; i + 1 < k; i += 2, g += 2) { + auto a1 = A_pack.row((g - 1 + i_pack - i_pack2) % np); + auto a2 = A_pack.row((g + i_pack - i_pack2) % np); + auto a3 = A_pack.row((g + 1 + i_pack - i_pack2) % np); + auto a4 = A_pack.row((g + 2 + i_pack - i_pack2) % np); + + rot_fuse2x2(a1, a2, a3, a4, C(g, i), S(g, i), C(g - 1, i), + S(g - 1, i), C(g + 1, i + 1), S(g + 1, i + 1), + C(g, i + 1), S(g, i + 1)); + } + if (k % 2 == 1) { + // k is odd, so there are two more rotations to apply + idx_t i = k - 1; + idx_t g = m - 2 - j + i; + + auto a1 = A_pack.row((g - 1 + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.row((g + i_pack - i_pack2 + np) % np); + auto a3 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np); + + rot_fuse1x2(a1, a2, a3, C(g, i), S(g, i), C(g - 1, i), + S(g - 1, i)); + } + // rows i_pack+np-2 and i_pack+np-1 are finished, copy them back to + // A + for (idx_t i = 0; i < n; i++) { + A(i_pack2 + np - 2, i) = A_pack((i_pack + np - 2) % np, i); + A(i_pack2 + np - 1, i) = A_pack((i_pack + np - 1) % np, i); + } + // Pack next rows and update i_pack and i_pack2 + if (i_pack2 > 1) { + for (idx_t i = 0; i < n; i++) { + A_pack((i_pack + np - 2) % np, i) = A(i_pack2 - 2, i); + A_pack((i_pack + np - 1) % np, i) = A(i_pack2 - 1, i); + } + i_pack = (i_pack + np - 2) % np; + i_pack2 -= 2; + } + else if (i_pack2 > 0) { + for (idx_t i = 0; i < n; i++) { + A_pack((i_pack + np - 1) % np, i) = A(i_pack2 - 1, i); + } + i_pack = (i_pack + np - 1) % np; + i_pack2 -= 1; + } + } + + // By now, i_pack2 should point to the start of the matrix. + assert(i_pack2 == 0); + + // Shutdown phase + for (idx_t j = (m - k + 1) % 2; j < k; ++j) { + for (idx_t i = j, g = 0; i < k; ++i, ++g) { + auto a1 = A_pack.row((g + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.row((g + 1 + i_pack - i_pack2 + np) % np); + rot(a1, a2, C(g, i), S(g, i)); + } + } + + // Copy the first np rows of A_pack back to A + for (idx_t i = 0; i < np; ++i) { + for (idx_t j = 0; j < n; ++j) { + A(i, j) = A_pack((i_pack + i) % np, j); + } + } + } + + // Kernel for lasr3, forward right variant + template + void lasr3_kernel_forward_right(ConstMatrix C, + ConstMatrix S, + Matrix A, + MemoryBlock work) + { + idx_t m = A.num_rows(); + idx_t n = A.num_columns(); + idx_t k = C.num_columns(); + + assert(C.num_rows() == S.num_rows()); + assert(C.num_columns() == S.num_columns()); + assert(n == C.num_rows() + 1); + assert(n > k + 1); + + // Number of columns that will be stored in the packed workspace matrix + const idx_t np = k + 2; + // During the algorithm, instead of applying the rotations directly to + // the matrix A, we will apply them to a packed version of A. This is + // done to allow for efficient vectorization of the rotations. The + // packed matrix is stored in the workspace array. Since we apply the + // rotations from the right, this packed matrix is always column-major + // regardless of the layout of A. + Matrix A_pack(m, np, work); + // A_pack will store np columns of A. + // To avoid having to shuffle columns in A, we keep track of two indices + // For i = i_pack2, ..., i_pack2 + np - 1, column (i_pack + i - i_pack2 + // + np) % np of A_pack corresponds to column i of A. + idx_t i_pack = 0; + idx_t i_pack2 = 0; + + // Copy the first np columns of A to A_pack + for (idx_t j = 0; j < np; ++j) { + for (idx_t i = 0; i < m; ++i) { + A_pack(i, j) = A(i, j); + } + } + + // Startup phase + for (idx_t j = 0; j + 1 < k; ++j) { + for (idx_t i = 0, g = j; i < j + 1; ++i, --g) { + rot(A_pack.column(g), A_pack.column(g + 1), C(g, i), + conj(S(g, i))); + } + } + + // Pipeline phase + for (idx_t j = k - 1; j + 1 < n - 1; j += 2) { + for (idx_t i = 0, g = j; i + 1 < k; i += 2, g -= 2) { + auto a1 = A_pack.column((g - 1 + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.column((g + i_pack - i_pack2 + np) % np); + auto a3 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np); + auto a4 = A_pack.column((g + 2 + i_pack - i_pack2 + np) % np); + + rot_fuse2x2(a1, a2, a3, a4, C(g, i), conj(S(g, i)), + C(g - 1, i + 1), conj(S(g - 1, i + 1)), C(g + 1, i), + conj(S(g + 1, i)), C(g, i + 1), conj(S(g, i + 1))); + } + if (k % 2 == 1) { + // k is odd, so there are two more rotations to apply + idx_t i = k - 1; + idx_t g = j - i; + + auto a1 = A_pack.column((g + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np); + auto a3 = A_pack.column((g + 2 + i_pack - i_pack2 + np) % np); + + rot_fuse2x1(a1, a2, a3, C(g, i), conj(S(g, i)), C(g + 1, i), + conj(S(g + 1, i))); + } + // columns i_pack and i_pack+1 of A_pack are finished, copy them + // back to A + for (idx_t i = 0; i < m; i++) { + A(i, i_pack2) = A_pack(i, i_pack); + A(i, i_pack2 + 1) = A_pack(i, (i_pack + 1) % np); + } + // Pack next columns and update i_pack and i_pack2 + if (j + 4 < n) { + for (idx_t i = 0; i < m; i++) { + A_pack(i, i_pack) = A(i, j + 3); + A_pack(i, (i_pack + 1) % np) = A(i, j + 4); + } + i_pack = (i_pack + 2) % np; + i_pack2 += 2; + } + else if (j + 3 < n) { + for (idx_t i = 0; i < m; i++) { + A_pack(i, i_pack) = A(i, j + 3); + } + i_pack = (i_pack + 1) % np; + i_pack2 += 1; + } + } + + // Shutdown phase + for (idx_t j = (n - k + 1) % 2; j < k; ++j) { + for (idx_t i = j, g = n - 2; i < k; ++i, --g) { + auto a1 = A_pack.column((g + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np); + rot(a1, a2, C(g, i), conj(S(g, i))); + } + } + + // Copy the last np columns of A_pack back to A + for (idx_t i = 0; i < std::min(np, n - i_pack2); ++i) { + for (idx_t j = 0; j < m; ++j) { + A(j, i_pack2 + i) = A_pack(j, (i_pack + i) % np); + } + } + } + + // Kernel for lasr3, backward right variant + template + void lasr3_kernel_backward_right(ConstMatrix C, + ConstMatrix S, + Matrix A, + MemoryBlock work) + { + idx_t m = A.num_rows(); + idx_t n = A.num_columns(); + idx_t k = C.num_columns(); + + assert(C.num_rows() == S.num_rows()); + assert(C.num_columns() == S.num_columns()); + assert(n == C.num_rows() + 1); + assert(n > k + 1); + + // Number of columns that will be stored in the packed workspace matrix + const idx_t np = k + 2; + // During the algorithm, instead of applying the rotations directly to + // the matrix A, we will apply them to a packed version of A. This is + // done to allow for efficient vectorization of the rotations. The + // packed matrix is stored in the workspace array. Since we apply the + // rotations from the right, this packed matrix is always column-major + // regardless of the layout of A. + Matrix A_pack(m, np, work); + // A_pack will store np columns of A. + // To avoid having to shuffle columns in A, we keep track of two indices + // For i = i_pack2, ..., i_pack2 + np - 1, column (i_pack + i - i_pack2 + // + np) % np of A_pack corresponds to column i of A. + idx_t i_pack = 0; + idx_t i_pack2 = n - np; + + // Copy the last np columns of A to A_pack + for (idx_t i = 0; i < m; ++i) { + for (idx_t j = 0; j < np; ++j) { + A_pack(i, j) = A(i, n - np + j); + } + } + + // Startup phase + for (idx_t j = 0; j + 1 < k; ++j) { + for (idx_t i = 0, g = n - 2 - j; i < j + 1; ++i, ++g) { + rot(A_pack.column((g - i_pack2) % np), + A_pack.column((g + 1 - i_pack2) % np), C(g, i), + conj(S(g, i))); + } + } + + // Pipeline phase + for (idx_t j = k - 1; j + 1 < n - 1; j += 2) { + for (idx_t i = 0, g = n - 2 - j; i + 1 < k; i += 2, g += 2) { + auto a1 = A_pack.column((g - 1 + i_pack - i_pack2) % np); + auto a2 = A_pack.column((g + i_pack - i_pack2) % np); + auto a3 = A_pack.column((g + 1 + i_pack - i_pack2) % np); + auto a4 = A_pack.column((g + 2 + i_pack - i_pack2) % np); + + rot_fuse2x2(a1, a2, a3, a4, C(g, i), conj(S(g, i)), C(g - 1, i), + conj(S(g - 1, i)), C(g + 1, i + 1), + conj(S(g + 1, i + 1)), C(g, i + 1), + conj(S(g, i + 1))); + } + if (k % 2 == 1) { + // k is odd, so there are two more rotations to apply + idx_t i = k - 1; + idx_t g = n - 2 - j + i; + + auto a1 = A_pack.column((g - 1 + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.column((g + i_pack - i_pack2 + np) % np); + auto a3 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np); + + rot_fuse1x2(a1, a2, a3, C(g, i), conj(S(g, i)), C(g - 1, i), + conj(S(g - 1, i))); + } + // columns i_pack+np-2 and i_pack+np-1 are finished, copy them back + // to A + for (idx_t i = 0; i < m; i++) { + A(i, i_pack2 + np - 2) = A_pack(i, (i_pack + np - 2) % np); + A(i, i_pack2 + np - 1) = A_pack(i, (i_pack + np - 1) % np); + } + // Pack next rows and update i_pack and i_pack2 + if (i_pack2 > 1) { + for (idx_t i = 0; i < m; i++) { + A_pack(i, (i_pack + np - 2) % np) = A(i, i_pack2 - 2); + A_pack(i, (i_pack + np - 1) % np) = A(i, i_pack2 - 1); + } + i_pack = (i_pack + np - 2) % np; + i_pack2 -= 2; + } + else if (i_pack2 > 0) { + for (idx_t i = 0; i < m; i++) { + A_pack(i, (i_pack + np - 1) % np) = A(i, i_pack2 - 1); + } + i_pack = (i_pack + np - 1) % np; + i_pack2 -= 1; + } + } + + // By now, i_pack2 should point to the start of the matrix. + assert(i_pack2 == 0); + + // Shutdown phase + for (idx_t j = (n - k + 1) % 2; j < k; ++j) { + for (idx_t i = j, g = 0; i < k; ++i, ++g) { + auto a1 = A_pack.column((g + i_pack - i_pack2 + np) % np); + auto a2 = A_pack.column((g + 1 + i_pack - i_pack2 + np) % np); + rot(a1, a2, C(g, i), conj(S(g, i))); + } + } + + // Copy the first np columns of A_pack back to A + for (idx_t i = 0; i < m; ++i) { + for (idx_t j = 0; j < np; ++j) { + A(i, j) = A_pack(i, (i_pack + j) % np); + } + } + } +} // namespace internal + +/** + * @copydoc lasr3 + */ +template +void lasr3(Side side, + Direction direct, + ConstMatrix C, + ConstMatrix S, + Matrix A, + MemoryBlock work) +{ + const idx_t m = A.num_rows(); + const idx_t n = A.num_columns(); + const idx_t k = C.num_columns(); + const idx_t num_rot = C.num_rows(); + + assert(side == Side::Left || side == Side::Right); + assert(direct == Direction::Forward || direct == Direction::Backward); + assert(C.num_rows() == S.num_rows()); + assert(C.num_columns() == S.num_columns()); + assert(side == Side::Left ? m : n == C.num_rows() + 1); + assert(work.size() >= lasr3_workquery(side, direct, C, S, A)); + + // Blocksize for the number of rows/columns of A to be processed at once + const idx_t block_mn = 256; + // Blocksize for the number of rotation sequences to be processed at once + const idx_t block_k = 64; + + if (num_rot < 4) { + // If there are less than 4 rotations, we can't really pipeline, + // just use standard loops + if (side == Side::Left) { + if (direct == Direction::Forward) { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = 0; j < num_rot; ++j) + rot(A.row(j), A.row(j + 1), C(j, i), S(j, i)); + } + else { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = num_rot - 1; j >= 0; --j) + rot(A.row(j), A.row(j + 1), C(j, i), S(j, i)); + } + } + else { + if (direct == Direction::Forward) { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = 0; j < num_rot; ++j) + rot(A.column(j), A.column(j + 1), C(j, i), + conj(S(j, i))); + } + else { + for (idx_t i = 0; i < k; ++i) + for (idx_t j = num_rot - 1; j >= 0; --j) + rot(A.column(j), A.column(j + 1), C(j, i), + conj(S(j, i))); + } + } + return; + } + + // We can only pipeline if num_rot > k + 2 + // And for good performance, we want an even larger ratio + const idx_t block_k2 = std::min(num_rot - 3, block_k); + + if (side == Side::Left and direct == Direction::Forward) { + for (idx_t n_b = 0; n_b < n; n_b += block_mn) { + auto A2 = A.submatrix(0, m, n_b, std::min(n, n_b + block_mn)); + for (idx_t k_b = 0; k_b < k; k_b += block_k2) { + auto C2 = C.submatrix(0, C.num_rows(), k_b, + std::min(k, k_b + block_k2)); + auto S2 = S.submatrix(0, S.num_rows(), k_b, + std::min(k, k_b + block_k2)); + internal::lasr3_kernel_forward_left(C2, S2, A2, work); + } + } + } + + if (side == Side::Left and direct == Direction::Backward) { + for (idx_t n_b = 0; n_b < n; n_b += block_mn) { + auto A2 = A.submatrix(0, m, n_b, std::min(n, n_b + block_mn)); + for (idx_t k_b = 0; k_b < k; k_b += block_k2) { + auto C2 = C.submatrix(0, C.num_rows(), k_b, + std::min(k, k_b + block_k2)); + auto S2 = S.submatrix(0, S.num_rows(), k_b, + std::min(k, k_b + block_k2)); + internal::lasr3_kernel_backward_left(C2, S2, A2, work); + } + } + } + + if (side == Side::Right and direct == Direction::Forward) { + for (idx_t m_b = 0; m_b < m; m_b += block_mn) { + auto A2 = A.submatrix(m_b, std::min(m, m_b + block_mn), 0, n); + for (idx_t k_b = 0; k_b < k; k_b += block_k2) { + auto C2 = C.submatrix(0, C.num_rows(), k_b, + std::min(k, k_b + block_k2)); + auto S2 = S.submatrix(0, S.num_rows(), k_b, + std::min(k, k_b + block_k2)); + internal::lasr3_kernel_forward_right(C2, S2, A2, work); + } + } + } + + if (side == Side::Right and direct == Direction::Backward) { + for (idx_t m_b = 0; m_b < m; m_b += block_mn) { + auto A2 = A.submatrix(m_b, std::min(m, m_b + block_mn), 0, n); + for (idx_t k_b = 0; k_b < k; k_b += block_k2) { + auto C2 = C.submatrix(0, C.num_rows(), k_b, + std::min(k, k_b + block_k2)); + auto S2 = S.submatrix(0, S.num_rows(), k_b, + std::min(k, k_b + block_k2)); + internal::lasr3_kernel_backward_right(C2, S2, A2, work); + } + } + } +} + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_LASR3_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/utils.hpp b/LAPACK_CPP/include/lapack_cpp/utils.hpp new file mode 100644 index 000000000..8276c3e16 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/utils.hpp @@ -0,0 +1,87 @@ +#ifndef LAPACK_CPP_UTILS_HPP +#define LAPACK_CPP_UTILS_HPP + +#include +#include + +#include "lapack_cpp/base.hpp" + +namespace lapack_cpp { + +// Code for printing +template +void print(const Vector& v) +{ + std::cout << "(" << v.size() << ")[" << std::endl; + for (idx_t i = 0; i < v.size(); ++i) { + std::cout << v[i] << std::endl; + } + std::cout << "]" << std::endl; +} + +// Code for printing +template +void print(const Matrix& m) +{ + std::cout << "(" << m.num_rows() << "," << m.num_columns() << ")[" + << std::endl; + for (idx_t i = 0; i < m.num_rows(); ++i) { + std::cout << "["; + for (idx_t j = 0; j < m.num_columns() - 1; ++j) { + std::cout << m(i, j) << ","; + } + std::cout << m(i, m.num_columns() - 1) << "]" << std::endl; + } + std::cout << "]" << std::endl; +} + +// Code for printing +template +void print(const ConstMatrix& m) +{ + std::cout << "(" << m.num_rows() << "," << m.num_columns() << ")[" + << std::endl; + for (idx_t i = 0; i < m.num_rows(); ++i) { + std::cout << "["; + for (idx_t j = 0; j < m.num_columns() - 1; ++j) { + std::cout << m(i, j) << ","; + } + std::cout << m(i, m.num_columns() - 1) << "]" << std::endl; + } + std::cout << "]" << std::endl; +} + +// Initialize a matrix with random values +template +void randomize(Matrix& m) +{ + for (idx_t j = 0; j < m.num_columns(); ++j) { + for (idx_t i = 0; i < m.num_rows(); ++i) { + m(i, j) = (T)rand() / (T)RAND_MAX; + } + } +} + +template +void randomize(Matrix, layout, idx_t>& m) +{ + for (idx_t j = 0; j < m.num_columns(); ++j) { + for (idx_t i = 0; i < m.num_rows(); ++i) { + m(i, j) = std::complex((T)rand() / (T)RAND_MAX, + (T)rand() / (T)RAND_MAX); + } + } +} + +// Initialize a vector with random values +template +void randomize(Vector& v) +{ + for (idx_t i = 0; i < v.size(); ++i) { + v[i] = (T)rand() / (T)RAND_MAX; + } +} + +} // namespace lapack_cpp + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/lartg_c.f90 b/LAPACK_CPP/src/lapack_c/lartg_c.f90 new file mode 100644 index 000000000..6e88b07f5 --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/lartg_c.f90 @@ -0,0 +1,53 @@ +subroutine lapack_c_slartg(f, g, c, s, r) bind(c, name='lapack_c_slartg') + use iso_c_binding + implicit none + real(kind=c_float), intent(in), value :: f, g + real(kind=c_float), intent(out) :: c, s, r + + external slartg + + ! Forward call to fortran implementation + call slartg( f, g, c, s, r ) + +end subroutine lapack_c_slartg + +subroutine lapack_c_dlartg(f, g, c, s, r) bind(c, name='lapack_c_dlartg') + use iso_c_binding + implicit none + real(kind=c_double), intent(in), value :: f, g + real(kind=c_double), intent(out) :: c, s, r + + external dlartg + + ! Forward call to fortran implementation + call dlartg( f, g, c, s, r ) + +end subroutine lapack_c_dlartg + +subroutine lapack_c_clartg(f, g, c, s, r) bind(c, name='lapack_c_clartg') + use iso_c_binding + implicit none + complex(kind=c_float_complex), intent(in), value :: f, g + complex(kind=c_float_complex), intent(out) :: s, r + real(kind=c_float), intent(out) :: c + + external clartg + + ! Forward call to fortran implementation + call clartg( f, g, c, s, r ) + +end subroutine lapack_c_clartg + +subroutine lapack_c_zlartg(f, g, c, s, r) bind(c, name='lapack_c_zlartg') + use iso_c_binding + implicit none + complex(kind=c_double_complex), intent(in), value :: f, g + complex(kind=c_double_complex), intent(out) :: s, r + real(kind=c_double), intent(out) :: c + + external zlartg + + ! Forward call to fortran implementation + call zlartg( f, g, c, s, r ) + +end subroutine lapack_c_zlartg \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp new file mode 100644 index 000000000..344703e4e --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp @@ -0,0 +1,206 @@ +#include "lapack_c/lasr3.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/lasr3.hpp" + +using namespace lapack_cpp; + +// Actual definitions +extern "C" { + +void slasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float* C, + lapack_idx ldc, + const float* S, + lapack_idx lds, + float* A, + lapack_idx lda, + float* work, + lapack_idx lwork) +{ + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); +} + +void dlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double* C, + lapack_idx ldc, + const double* S, + lapack_idx lds, + double* A, + lapack_idx lda, + double* work, + lapack_idx lwork) +{ + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); +} + +void clasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float* C, + lapack_idx ldc, + const lapack_float_complex* S, + lapack_idx lds, + lapack_float_complex* A, + lapack_idx lda, + lapack_float_complex* work, + lapack_idx lwork) +{ + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, + S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); +} + +void zlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double* C, + lapack_idx ldc, + const lapack_double_complex* S, + lapack_idx lds, + lapack_double_complex* A, + lapack_idx lda, + lapack_double_complex* work, + lapack_idx lwork) +{ + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_( + n_rot, k, S, lds); + Matrix A_(m, n, A, + lda); + + if (lwork == -1) { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); +} + +void cslasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float* C, + lapack_idx ldc, + const float* S, + lapack_idx lds, + lapack_float_complex* A, + lapack_idx lda, + lapack_float_complex* work, + lapack_idx lwork) +{ + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); +} + +void zdlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double* C, + lapack_idx ldc, + const double* S, + lapack_idx lds, + lapack_double_complex* A, + lapack_idx lda, + lapack_double_complex* work, + lapack_idx lwork) +{ + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, + lda); + + if (lwork == -1) { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); +} + +} // extern "C" \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/rot_c.f90 b/LAPACK_CPP/src/lapack_c/rot_c.f90 new file mode 100644 index 000000000..6876c9036 --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/rot_c.f90 @@ -0,0 +1,57 @@ +subroutine lapack_c_drot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_drot') + use iso_c_binding + implicit none + integer(kind=c_int), value, intent(in) :: n, incx, incy + real(kind=c_double), value, intent(in) :: c, s + real(kind=c_double), dimension(*), intent(inout) :: dx, dy + + external drot + + ! Forward call to fortran implementation + call drot( n, dx ,incx, dy, incy, c, s ) + +end subroutine lapack_c_drot + +subroutine lapack_c_srot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_srot') + use iso_c_binding + implicit none + integer(kind=c_int), value, intent(in) :: n, incx, incy + real(kind=c_float), value, intent(in) :: c, s + real(kind=c_float), dimension(*), intent(inout) :: dx, dy + + external srot + + ! Forward call to fortran implementation + call srot( n, dx ,incx, dy, incy, c, s ) + +end subroutine lapack_c_srot + +subroutine lapack_c_crot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_crot') + use iso_c_binding + implicit none + integer(kind=c_int), value, intent(in) :: n, incx, incy + real(kind=c_float), value, intent(in) :: c + complex(kind=c_float_complex), value, intent(in) :: s + complex(kind=c_float_complex), dimension(*), intent(inout) :: dx, dy + + external crot + + ! Forward call to fortran implementation + call crot( n, dx ,incx, dy, incy, c, s ) + +end subroutine lapack_c_crot + +subroutine lapack_c_zrot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_zrot') + use iso_c_binding + implicit none + integer(kind=c_int), value, intent(in) :: n, incx, incy + real(kind=c_double), value, intent(in) :: c + complex(kind=c_double_complex), value, intent(in) :: s + complex(kind=c_double_complex), dimension(*), intent(inout) :: dx, dy + + external zrot + + ! Forward call to fortran implementation + call zrot( n, dx ,incx, dy, incy, c, s ) + +end subroutine lapack_c_zrot \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp new file mode 100644 index 000000000..576beca4a --- /dev/null +++ b/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp @@ -0,0 +1,43 @@ +#include +#include +#include + +#include "lapack_c.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/lartg.hpp" + +namespace lapack_cpp { + +template <> +void lartg(float f, float g, float& c, float& s, float& r) +{ + lapack_c_slartg(f, g, &c, &s, &r); +} + +template <> +void lartg(double f, double g, double& c, double& s, double& r) +{ + lapack_c_dlartg(f, g, &c, &s, &r); +} + +template <> +void lartg>(std::complex f, + std::complex g, + float& c, + std::complex& s, + std::complex& r) +{ + lapack_c_clartg(f, g, &c, &s, &r); +} + +template <> +void lartg>(std::complex f, + std::complex g, + double& c, + std::complex& s, + std::complex& r) +{ + lapack_c_zlartg(f, g, &c, &s, &r); +} + +} // namespace lapack_cpp \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp new file mode 100644 index 000000000..a50a35607 --- /dev/null +++ b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp @@ -0,0 +1,67 @@ +#ifdef USE_FORTRAN_BLAS + #include + #include + #include + + #include "lapack_c.h" + #include "lapack_cpp/base.hpp" + #include "lapack_cpp/blas/rot.hpp" + +namespace lapack_cpp { + +/** + * Templated wrapper around c functions, will be instantiated for each type. + * + * @tparam T + * @param A + * @param B + * @param C + */ +template +inline void rot_c_wrapper(const Vector& x, + const Vector& y, + TC c, + TS s) +{ + assert(x.size() == y.size()); + + if constexpr (std::is_same::value) { + lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); + } + else if constexpr (std::is_same::value) { + lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); + } + else if constexpr (std::is_same>::value) { + lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); + } + else if constexpr (std::is_same>::value) { + lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); + } + else { + assert(false); + } +} + + // We have a lot of types to instantiate for, so we use a macro to avoid + // repetition. + #define INSTANTIATE_ROT(T, TC, idx_t) \ + template <> \ + void rot(const Vector& x, const Vector& y, TC c, \ + T s) \ + { \ + rot_c_wrapper(x, y, c, s); \ + } + +INSTANTIATE_ROT(float, float, lapack_idx_t) +INSTANTIATE_ROT(double, double, lapack_idx_t) +INSTANTIATE_ROT(std::complex, float, lapack_idx_t) +INSTANTIATE_ROT(std::complex, double, lapack_idx_t) +INSTANTIATE_ROT(float, float, int) +INSTANTIATE_ROT(double, double, int) +INSTANTIATE_ROT(std::complex, float, int) +INSTANTIATE_ROT(std::complex, double, int) + + #undef INSTANTIATE_ROT + +} // namespace lapack_cpp +#endif // USE_FORTRAN_BLAS \ No newline at end of file From 1a95ff9b675b929c6e9ca4bd032f307ce3bb1c1d Mon Sep 17 00:00:00 2001 From: thijssteel Date: Thu, 29 Feb 2024 16:21:40 +0100 Subject: [PATCH 02/10] add fortran wrappers for lasr3 --- LAPACK_CPP/Makefile | 14 +- LAPACK_CPP/include/lapack_c/lasr3.h | 159 +++++----- LAPACK_CPP/src/lapack_c/lasr3_c.cpp | 379 ++++++++++++----------- LAPACK_CPP/src/lapack_fortran/clasr3.f90 | 32 ++ LAPACK_CPP/src/lapack_fortran/dlasr3.f90 | 30 ++ LAPACK_CPP/src/lapack_fortran/slasr3.f90 | 30 ++ LAPACK_CPP/src/lapack_fortran/zlasr3.f90 | 32 ++ 7 files changed, 410 insertions(+), 266 deletions(-) create mode 100644 LAPACK_CPP/src/lapack_fortran/clasr3.f90 create mode 100644 LAPACK_CPP/src/lapack_fortran/dlasr3.f90 create mode 100644 LAPACK_CPP/src/lapack_fortran/slasr3.f90 create mode 100644 LAPACK_CPP/src/lapack_fortran/zlasr3.f90 diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile index 6d0d8eb0c..cd5adf0bf 100644 --- a/LAPACK_CPP/Makefile +++ b/LAPACK_CPP/Makefile @@ -14,11 +14,19 @@ HEADERS = $(wildcard lapack_c/include/*.h) \ $(wildcard lapack_cpp/include/*/*.hpp) \ $(wildcard lapack_cpp/include/*/*/*.hpp) -OBJFILES = src/lapack_c/rot_c.o \ +OBJFILES = src/lapack_cpp/rot_cpp.o \ + src/lapack_cpp/lartg_cpp.o \ + src/lapack_c/rot_c.o \ src/lapack_c/lartg_c.o \ src/lapack_c/lasr3_c.o \ - src/lapack_cpp/rot_cpp.o \ - src/lapack_cpp/lartg_cpp.o \ + src/lapack_fortran/slasr3.o \ + src/lapack_fortran/dlasr3.o \ + src/lapack_fortran/clasr3.o \ + src/lapack_fortran/zlasr3.o \ + +# Fortran files +src/lapack_fortran/%.o: src/lapack_fortran/%.f90 + $(FC) $(FFLAGS) -cpp -c -o $@ $< # C files diff --git a/LAPACK_CPP/include/lapack_c/lasr3.h b/LAPACK_CPP/include/lapack_c/lasr3.h index efd86a916..e86479730 100644 --- a/LAPACK_CPP/include/lapack_c/lasr3.h +++ b/LAPACK_CPP/include/lapack_c/lasr3.h @@ -4,91 +4,96 @@ #include "lapack_c/util.h" #ifdef __cplusplus -extern "C" { -#endif // __cplusplus +extern "C" +{ +#endif // __cplusplus -void lapack_c_slasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const float* C, - lapack_idx ldc, - const float* S, - lapack_idx lds, - float* A, - lapack_idx lda, - float* work, - lapack_idx lwork); + void lapack_c_slasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float *C, + lapack_idx ldc, + const float *S, + lapack_idx lds, + float *A, + lapack_idx lda, + float *work, + lapack_idx lwork); -void lapack_c_dlasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const double* C, - lapack_idx ldc, - const double* S, - lapack_idx lds, - double* A, - lapack_idx lda, - double* work, - lapack_idx lwork); + void lapack_c_dlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double *C, + lapack_idx ldc, + const double *S, + lapack_idx lds, + double *A, + lapack_idx lda, + double *work, + lapack_idx lwork); -void lapack_c_clasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const float* C, - lapack_idx ldc, - const lapack_float_complex* S, - lapack_idx lds, - lapack_float_complex* A, - lapack_idx lda, - lapack_float_complex* work, - lapack_idx lwork); + void lapack_c_clasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float *C, + lapack_idx ldc, + const lapack_float_complex *S, + lapack_idx lds, + lapack_float_complex *A, + lapack_idx lda, + lapack_float_complex *work, + lapack_idx lwork); -void lapack_c_sclasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const float* C, - lapack_idx ldc, - const float* S, - lapack_idx lds, - lapack_float_complex* A, - lapack_idx lda, - lapack_float_complex* work, - lapack_idx lwork); + void lapack_c_sclasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float *C, + lapack_idx ldc, + const float *S, + lapack_idx lds, + lapack_float_complex *A, + lapack_idx lda, + lapack_float_complex *work, + lapack_idx lwork); -void lapack_c_zlasr3(lapack_idx m, - lapack_idx n, - lapack_idx k, - const double* C, - lapack_idx ldc, - const lapack_double_complex* S, - lapack_idx lds, - lapack_double_complex* A, - lapack_idx lda, - lapack_double_complex* work, - lapack_idx lwork); + void lapack_c_zlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double *C, + lapack_idx ldc, + const lapack_double_complex *S, + lapack_idx lds, + lapack_double_complex *A, + lapack_idx lda, + lapack_double_complex *work, + lapack_idx lwork); -void lapack_c_dzlasr3(lapack_idx m, - lapack_idx n, - lapack_idx k, - const double* C, - lapack_idx ldc, - const double* S, - lapack_idx lds, - lapack_double_complex* A, - lapack_idx lda, - lapack_double_complex* work, - lapack_idx lwork); + void lapack_c_dzlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double *C, + lapack_idx ldc, + const double *S, + lapack_idx lds, + lapack_double_complex *A, + lapack_idx lda, + lapack_double_complex *work, + lapack_idx lwork); #ifdef __cplusplus } -#endif // __cplusplus +#endif // __cplusplus #endif \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp index 344703e4e..8945d43ad 100644 --- a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp +++ b/LAPACK_CPP/src/lapack_c/lasr3_c.cpp @@ -5,202 +5,209 @@ using namespace lapack_cpp; // Actual definitions -extern "C" { - -void slasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const float* C, - lapack_idx ldc, - const float* S, - lapack_idx lds, - float* A, - lapack_idx lda, - float* work, - lapack_idx lwork) +extern "C" { - Side side_ = char2side(side); - Direction direct_ = char2direct(direct); - const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; - - ConstMatrix C_(n_rot, k, C, ldc); - ConstMatrix S_(n_rot, k, S, lds); - Matrix A_(m, n, A, lda); - - if (lwork == -1) { - lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); - *work = lwork_; - return; - } - MemoryBlock work_(lwork, work); - lasr3(side_, direct_, C_, S_, A_, work_); -} - -void dlasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const double* C, - lapack_idx ldc, - const double* S, - lapack_idx lds, - double* A, - lapack_idx lda, - double* work, - lapack_idx lwork) -{ - Side side_ = char2side(side); - Direction direct_ = char2direct(direct); - const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; - - ConstMatrix C_(n_rot, k, C, ldc); - ConstMatrix S_(n_rot, k, S, lds); - Matrix A_(m, n, A, lda); - - if (lwork == -1) { - lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); - *work = lwork_; - return; + void lapack_c_slasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float *C, + lapack_idx ldc, + const float *S, + lapack_idx lds, + float *A, + lapack_idx lda, + float *work, + lapack_idx lwork) + { + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) + { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); } - MemoryBlock work_(lwork, work); - lasr3(side_, direct_, C_, S_, A_, work_); -} - -void clasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const float* C, - lapack_idx ldc, - const lapack_float_complex* S, - lapack_idx lds, - lapack_float_complex* A, - lapack_idx lda, - lapack_float_complex* work, - lapack_idx lwork) -{ - Side side_ = char2side(side); - Direction direct_ = char2direct(direct); - const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; - - ConstMatrix C_(n_rot, k, C, ldc); - ConstMatrix S_(n_rot, k, - S, lds); - Matrix A_(m, n, A, lda); - - if (lwork == -1) { - lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); - *work = lwork_; - return; + void lapack_c_dlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double *C, + lapack_idx ldc, + const double *S, + lapack_idx lds, + double *A, + lapack_idx lda, + double *work, + lapack_idx lwork) + { + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) + { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); } - MemoryBlock work_(lwork, work); - lasr3(side_, direct_, C_, S_, A_, work_); -} - -void zlasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const double* C, - lapack_idx ldc, - const lapack_double_complex* S, - lapack_idx lds, - lapack_double_complex* A, - lapack_idx lda, - lapack_double_complex* work, - lapack_idx lwork) -{ - Side side_ = char2side(side); - Direction direct_ = char2direct(direct); - const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; - - ConstMatrix C_(n_rot, k, C, ldc); - ConstMatrix S_( - n_rot, k, S, lds); - Matrix A_(m, n, A, - lda); - - if (lwork == -1) { - lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); - *work = lwork_; - return; + void lapack_c_clasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float *C, + lapack_idx ldc, + const lapack_float_complex *S, + lapack_idx lds, + lapack_float_complex *A, + lapack_idx lda, + lapack_float_complex *work, + lapack_idx lwork) + { + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, + S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) + { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); } - MemoryBlock work_(lwork, work); - lasr3(side_, direct_, C_, S_, A_, work_); -} - -void cslasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const float* C, - lapack_idx ldc, - const float* S, - lapack_idx lds, - lapack_float_complex* A, - lapack_idx lda, - lapack_float_complex* work, - lapack_idx lwork) -{ - Side side_ = char2side(side); - Direction direct_ = char2direct(direct); - const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; - - ConstMatrix C_(n_rot, k, C, ldc); - ConstMatrix S_(n_rot, k, S, lds); - Matrix A_(m, n, A, lda); - - if (lwork == -1) { - lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); - *work = lwork_; - return; + void lapack_c_zlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double *C, + lapack_idx ldc, + const lapack_double_complex *S, + lapack_idx lds, + lapack_double_complex *A, + lapack_idx lda, + lapack_double_complex *work, + lapack_idx lwork) + { + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_( + n_rot, k, S, lds); + Matrix A_(m, n, A, + lda); + + if (lwork == -1) + { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); } - MemoryBlock work_(lwork, work); - lasr3(side_, direct_, C_, S_, A_, work_); -} - -void zdlasr3(char side, - char direct, - lapack_idx m, - lapack_idx n, - lapack_idx k, - const double* C, - lapack_idx ldc, - const double* S, - lapack_idx lds, - lapack_double_complex* A, - lapack_idx lda, - lapack_double_complex* work, - lapack_idx lwork) -{ - Side side_ = char2side(side); - Direction direct_ = char2direct(direct); - const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; - - ConstMatrix C_(n_rot, k, C, ldc); - ConstMatrix S_(n_rot, k, S, lds); - Matrix A_(m, n, A, - lda); - - if (lwork == -1) { - lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); - *work = lwork_; - return; + void lapack_c_sclasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const float *C, + lapack_idx ldc, + const float *S, + lapack_idx lds, + lapack_float_complex *A, + lapack_idx lda, + lapack_float_complex *work, + lapack_idx lwork) + { + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, lda); + + if (lwork == -1) + { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); } - MemoryBlock work_(lwork, work); - lasr3(side_, direct_, C_, S_, A_, work_); -} + void lapack_c_dzlasr3(char side, + char direct, + lapack_idx m, + lapack_idx n, + lapack_idx k, + const double *C, + lapack_idx ldc, + const double *S, + lapack_idx lds, + lapack_double_complex *A, + lapack_idx lda, + lapack_double_complex *work, + lapack_idx lwork) + { + Side side_ = char2side(side); + Direction direct_ = char2direct(direct); + const lapack_idx n_rot = side_ == Side::Left ? m - 1 : n - 1; + + ConstMatrix C_(n_rot, k, C, ldc); + ConstMatrix S_(n_rot, k, S, lds); + Matrix A_(m, n, A, + lda); + + if (lwork == -1) + { + lapack_idx lwork_ = lasr3_workquery(side_, direct_, C_, S_, A_); + *work = lwork_; + return; + } + + MemoryBlock work_(lwork, work); + lasr3(side_, direct_, C_, S_, A_, work_); + } -} // extern "C" \ No newline at end of file +} // extern "C" \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_fortran/clasr3.f90 b/LAPACK_CPP/src/lapack_fortran/clasr3.f90 new file mode 100644 index 000000000..b6b2957d0 --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/clasr3.f90 @@ -0,0 +1,32 @@ +subroutine clasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_clasr3(side_c, direct_c, m_c, n_c, k_c, C_c,& + ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_clasr3") + use iso_c_binding, only: c_char, c_float, c_float_complex, c_int + implicit none + integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c + character(c_char), intent(in) :: side_c, direct_c + real(c_float), intent(in) :: C_c(ldc_c,*) + complex(c_float_complex), intent(in) :: S_c(lds_c,*) + complex(c_float_complex), intent(inout) :: A_c(lda_c,*), work_c(*) + end subroutine lapack_c_clasr3 + end interface + + ! Arguments + integer, intent(in) :: m,n,k,ldc, lds, lda, lwork + character, intent(in) :: side, direct + real, intent(in) :: C(ldc,*) + complex, intent(in) :: S(lds,*) + complex, intent(inout) :: A(lda,*), work(*) + character(c_char) :: side_to_c, direct_to_c + + ! Convert characters to C characters + side_to_c = side + direct_to_c = direct + + ! Call C function + call lapack_c_clasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork) +end subroutine clasr3 diff --git a/LAPACK_CPP/src/lapack_fortran/dlasr3.f90 b/LAPACK_CPP/src/lapack_fortran/dlasr3.f90 new file mode 100644 index 000000000..0a9392d7d --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/dlasr3.f90 @@ -0,0 +1,30 @@ +subroutine dlasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_dlasr3(side_c, direct_c, m_c, n_c, k_c, C_c,& + ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_dlasr3") + use iso_c_binding, only: c_char, c_double, c_int + implicit none + integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c + character(c_char), intent(in) :: side_c, direct_c + real(c_double), intent(in) :: C_c(ldc_c,*),S_c(lds_c,*) + real(c_double), intent(inout) :: A_c(lda_c,*), work_c(*) + end subroutine lapack_c_dlasr3 + end interface + + ! Arguments + integer, intent(in) :: m,n,k,ldc, lds, lda, lwork + character, intent(in) :: side, direct + double precision, intent(in) :: C(ldc,*),S(lds,*) + double precision, intent(inout) :: A(lda,*), work(*) + character(c_char) :: side_to_c, direct_to_c + + ! Convert characters to C characters + side_to_c = side + direct_to_c = direct + + ! Call C function + call lapack_c_dlasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork) +end subroutine dlasr3 diff --git a/LAPACK_CPP/src/lapack_fortran/slasr3.f90 b/LAPACK_CPP/src/lapack_fortran/slasr3.f90 new file mode 100644 index 000000000..91ea69c63 --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/slasr3.f90 @@ -0,0 +1,30 @@ +subroutine slasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_slasr3(side_c, direct_c, m_c, n_c, k_c, C_c,& + ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_slasr3") + use iso_c_binding, only: c_char, c_float, c_int + implicit none + integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c + character(c_char), intent(in) :: side_c, direct_c + real(c_float), intent(in) :: C_c(ldc_c,*),S_c(lds_c,*) + real(c_float), intent(inout) :: A_c(lda_c,*), work_c(*) + end subroutine lapack_c_slasr3 + end interface + + ! Arguments + integer, intent(in) :: m,n,k,ldc, lds, lda, lwork + character, intent(in) :: side, direct + real, intent(in) :: C(ldc,*),S(lds,*) + real, intent(inout) :: A(lda,*), work(*) + character(c_char) :: side_to_c, direct_to_c + + ! Convert characters to C characters + side_to_c = side + direct_to_c = direct + + ! Call C function + call lapack_c_slasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork) +end subroutine slasr3 diff --git a/LAPACK_CPP/src/lapack_fortran/zlasr3.f90 b/LAPACK_CPP/src/lapack_fortran/zlasr3.f90 new file mode 100644 index 000000000..ad0e6c662 --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/zlasr3.f90 @@ -0,0 +1,32 @@ +subroutine zlasr3 ( side, direct, m, n, k, C, ldc, S, lds, A, lda, work, lwork ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_zlasr3(side_c, direct_c, m_c, n_c, k_c, C_c,& + ldc_c, S_c, lds_c, A_c, lda_c, work_c, lwork_c) bind(c, name="lapack_c_zlasr3") + use iso_c_binding, only: c_char, c_double, c_double_complex, c_int + implicit none + integer(c_int), intent(in) :: m_c,n_c,k_c,ldc_c, lds_c, lda_c, lwork_c + character(c_char), intent(in) :: side_c, direct_c + real(c_double), intent(in) :: C_c(ldc_c,*) + complex(c_double_complex), intent(in) :: S_c(lds_c,*) + complex(c_double_complex), intent(inout) :: A_c(lda_c,*), work_c(*) + end subroutine lapack_c_zlasr3 + end interface + + ! Arguments + integer, intent(in) :: m,n,k,ldc, lds, lda, lwork + character, intent(in) :: side, direct + double precision, intent(in) :: C(ldc,*) + double complex, intent(in) :: S(lds,*) + double complex, intent(inout) :: A(lda,*), work(*) + character(c_char) :: side_to_c, direct_to_c + + ! Convert characters to C characters + side_to_c = side + direct_to_c = direct + + ! Call C function + call lapack_c_zlasr3(side_to_c, direct_to_c, m, n, k, C, ldc, S, lds, A, lda, work, lwork) +end subroutine zlasr3 From 2bdd724ecf85630cea3db91adc910f01d3701b02 Mon Sep 17 00:00:00 2001 From: thijssteel Date: Fri, 1 Mar 2024 18:10:08 +0100 Subject: [PATCH 03/10] small change in template of lasr3 --- LAPACK_CPP/Makefile | 14 ++--- LAPACK_CPP/example/profile_lasr3.cpp | 26 ++++++--- .../include/lapack_cpp/lapack/lasr3.hpp | 54 +++++++++---------- 3 files changed, 51 insertions(+), 43 deletions(-) diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile index cd5adf0bf..ec6b653a4 100644 --- a/LAPACK_CPP/Makefile +++ b/LAPACK_CPP/Makefile @@ -7,12 +7,12 @@ FFLAGS=-Wall -Wextra -pedantic -g all: ./example/test_lasr3 ./example/profile_lasr3 -HEADERS = $(wildcard lapack_c/include/*.h) \ - $(wildcard lapack_c/include/*/*.h) \ - $(wildcard lapack_c/include/*/*/*.h) \ - $(wildcard lapack_cpp/include/*.hpp) \ - $(wildcard lapack_cpp/include/*/*.hpp) \ - $(wildcard lapack_cpp/include/*/*/*.hpp) +HEADERS = $(wildcard include/lapack_c/*.h) \ + $(wildcard include/lapack_c/*/*.h) \ + $(wildcard include/lapack_c/*/*/*.h) \ + $(wildcard include/lapack_cpp/*.hpp) \ + $(wildcard include/lapack_cpp/*/*.hpp) \ + $(wildcard include/lapack_cpp/*/*/*.hpp) OBJFILES = src/lapack_cpp/rot_cpp.o \ src/lapack_cpp/lartg_cpp.o \ @@ -30,7 +30,7 @@ src/lapack_fortran/%.o: src/lapack_fortran/%.f90 # C files -src/lapack_c/%.o: src/lapack_c/%.cpp +src/lapack_c/%.o: src/lapack_c/%.cpp $(HEADERS) $(CXX) $(CXXFLAGS) -I./include -c -o $@ $< src/lapack_c/%.o: src/lapack_c/%.f90 diff --git a/LAPACK_CPP/example/profile_lasr3.cpp b/LAPACK_CPP/example/profile_lasr3.cpp index 456af8df8..2e79a99de 100644 --- a/LAPACK_CPP/example/profile_lasr3.cpp +++ b/LAPACK_CPP/example/profile_lasr3.cpp @@ -1,5 +1,5 @@ -#include // for high_resolution_clock +#include // for high_resolution_clock #include #include @@ -28,8 +28,10 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction) // Generate random, but valid rotations randomize(C); randomize(S); - for (idx_t i = 0; i < n_rot; ++i) { - for (idx_t j = 0; j < k; ++j) { + for (idx_t i = 0; i < n_rot; ++i) + { + for (idx_t j = 0; j < k; ++j) + { T f = C(i, j); T g = S(i, j); T r; @@ -49,7 +51,8 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction) const idx_t n_warmup = 20; std::vector timings(n_timings); - for (idx_t i = 0; i < n_timings; ++i) { + for (idx_t i = 0; i < n_timings; ++i) + { A = A_copy; auto start = std::chrono::high_resolution_clock::now(); lasr3(side, direction, C.as_const(), S.as_const(), A, work); @@ -62,6 +65,11 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction) mean += timings[i]; mean /= n_timings - n_warmup; + float std_dev = 0; + for (idx_t i = n_warmup; i < n_timings; ++i) + std_dev += (timings[i] - mean) * (timings[i] - mean); + std_dev = std::sqrt(std_dev / (n_timings - n_warmup - 1)); + long nflops = side == Side::Left ? 6. * (m - 1) * n * k : 6. * (n - 1) * m * k; @@ -69,6 +77,7 @@ void profile_lasr3(idx_t m, idx_t n, idx_t k, Side side, Direction direction) << ", side = " << (char)side << ", direction = " << (char)direction << ", mean time = " << mean << " s" + << ", std dev = " << std_dev / mean * 100 << " %" << ", flop rate = " << nflops / mean * 1.0e-9 << " GFlops" << std::endl; } @@ -81,12 +90,15 @@ int main() const idx_t nb = 1000; const idx_t k = 64; - for (int s = 0; s < 2; ++s) { + for (int s = 0; s < 2; ++s) + { Side side = s == 0 ? Side::Left : Side::Right; - for (int d = 0; d < 2; ++d) { + for (int d = 0; d < 2; ++d) + { Direction direction = d == 0 ? Direction::Forward : Direction::Backward; - for (idx_t n_rot = 99; n_rot <= 1600; n_rot += 100) { + for (idx_t n_rot = 99; n_rot <= 1600; n_rot += 100) + { idx_t m = side == Side::Left ? n_rot + 1 : nb; idx_t n = side == Side::Left ? nb : n_rot + 1; diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp index 0a3ffbb8a..535ca4ce6 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp @@ -33,7 +33,7 @@ namespace lapack_cpp { * * @tparam T The type of the elements of the matrix A. * - * @tparam TC The type of the elements of the matrix C. + * @tparam real_t The type of the elements of the matrix C. * * @tparam TS The type of the elements of the matrix S. * @@ -41,14 +41,14 @@ namespace lapack_cpp { * * @tparam idx_t The type of the indices. * - * @note if either TC or TS is a complex type, then T must also be a complex + * @note if either real_t or TS is a complex type, then T must also be a complex * type. * */ -template +template void lasr3(Side side, Direction direct, - ConstMatrix C, + ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, MemoryBlock work); @@ -56,10 +56,10 @@ void lasr3(Side side, /** * Workspace query for lasr3. */ -template +template idx_t lasr3_workquery(Side side, Direction direct, - ConstMatrix C, + ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A) { @@ -89,10 +89,10 @@ idx_t lasr3_workquery(Side side, /** * @copydoc lasr3 */ -template +template void lasr3(Side side, Direction direct, - ConstMatrix C, + ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A) { @@ -117,13 +117,13 @@ namespace internal { * @param c2 The cosine of the second rotation. * @param s2 The sine of the second rotation. */ - template + template void rot_fuse2x1(Vector x1, Vector x2, Vector x3, - TC c1, + real_t c1, TS s1, - TC c2, + real_t c2, TS s2) { assert(x1.size() == x2.size() && x2.size() == x3.size()); @@ -150,13 +150,13 @@ namespace internal { * @param c2 The cosine of the second rotation. * @param s2 The sine of the second rotation. */ - template + template void rot_fuse1x2(Vector x1, Vector x2, Vector x3, - TC c1, + real_t c1, TS s1, - TC c2, + real_t c2, TS s2) { assert(x1.size() == x2.size() && x2.size() == x3.size()); @@ -192,18 +192,18 @@ namespace internal { * @param c4 The cosine of the fourth rotation. * @param s4 The sine of the fourth rotation. */ - template + template void rot_fuse2x2(Vector x1, Vector x2, Vector x3, Vector x4, - TC c1, + real_t c1, TS s1, - TC c2, + real_t c2, TS s2, - TC c3, + real_t c3, TS s3, - TC c4, + real_t c4, TS s4) { assert(x1.size() == x2.size() && x1.size() == x3.size() && @@ -223,11 +223,10 @@ namespace internal { // Kernel for lasr3, forward left variant template - void lasr3_kernel_forward_left(ConstMatrix C, + void lasr3_kernel_forward_left(ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, MemoryBlock work) @@ -338,11 +337,10 @@ namespace internal { // Kernel for lasr3, backward left variant template - void lasr3_kernel_backward_left(ConstMatrix C, + void lasr3_kernel_backward_left(ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, MemoryBlock work) @@ -457,11 +455,10 @@ namespace internal { // Kernel for lasr3, forward right variant template - void lasr3_kernel_forward_right(ConstMatrix C, + void lasr3_kernel_forward_right(ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, MemoryBlock work) @@ -573,11 +570,10 @@ namespace internal { // Kernel for lasr3, backward right variant template - void lasr3_kernel_backward_right(ConstMatrix C, + void lasr3_kernel_backward_right(ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, MemoryBlock work) @@ -696,10 +692,10 @@ namespace internal { /** * @copydoc lasr3 */ -template +template void lasr3(Side side, Direction direct, - ConstMatrix C, + ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, MemoryBlock work) From f57a79230497f03e2b7b0fd58f01a3b21e2040cf Mon Sep 17 00:00:00 2001 From: thijssteel Date: Fri, 1 Mar 2024 18:10:27 +0100 Subject: [PATCH 04/10] use direct overloading for rot --- LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp | 118 ++++++++++++++++---------- 1 file changed, 72 insertions(+), 46 deletions(-) diff --git a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp index a50a35607..ce8609ca0 100644 --- a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp +++ b/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp @@ -9,59 +9,85 @@ namespace lapack_cpp { -/** - * Templated wrapper around c functions, will be instantiated for each type. - * - * @tparam T - * @param A - * @param B - * @param C - */ -template -inline void rot_c_wrapper(const Vector& x, - const Vector& y, - TC c, - TS s) +template<> +void rot(const Vector& x, + const Vector& y, + float c, + float s) { assert(x.size() == y.size()); + lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); +} + +template<> +void rot(const Vector& x, + const Vector& y, + double c, + double s) +{ + assert(x.size() == y.size()); + lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); +} + +template<> +void rot(const Vector, lapack_idx_t>& x, + const Vector, lapack_idx_t>& y, + float c, + std::complex s) +{ + assert(x.size() == y.size()); + lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); +} + +template<> +void rot(const Vector, lapack_idx_t>& x, + const Vector, lapack_idx_t>& y, + double c, + std::complex s) +{ + assert(x.size() == y.size()); + lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); +} - if constexpr (std::is_same::value) { - lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); - } - else if constexpr (std::is_same::value) { - lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); - } - else if constexpr (std::is_same>::value) { - lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); - } - else if constexpr (std::is_same>::value) { - lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); - } - else { - assert(false); - } +template<> +void rot(const Vector& x, + const Vector& y, + float c, + float s) +{ + assert(x.size() == y.size()); + lapack_c_srot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); } - // We have a lot of types to instantiate for, so we use a macro to avoid - // repetition. - #define INSTANTIATE_ROT(T, TC, idx_t) \ - template <> \ - void rot(const Vector& x, const Vector& y, TC c, \ - T s) \ - { \ - rot_c_wrapper(x, y, c, s); \ - } +template<> +void rot(const Vector& x, + const Vector& y, + double c, + double s) +{ + assert(x.size() == y.size()); + lapack_c_drot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); +} -INSTANTIATE_ROT(float, float, lapack_idx_t) -INSTANTIATE_ROT(double, double, lapack_idx_t) -INSTANTIATE_ROT(std::complex, float, lapack_idx_t) -INSTANTIATE_ROT(std::complex, double, lapack_idx_t) -INSTANTIATE_ROT(float, float, int) -INSTANTIATE_ROT(double, double, int) -INSTANTIATE_ROT(std::complex, float, int) -INSTANTIATE_ROT(std::complex, double, int) +template<> +void rot(const Vector, int>& x, + const Vector, int>& y, + float c, + std::complex s) +{ + assert(x.size() == y.size()); + lapack_c_crot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); +} - #undef INSTANTIATE_ROT +template<> +void rot(const Vector, int>& x, + const Vector, int>& y, + double c, + std::complex s) +{ + assert(x.size() == y.size()); + lapack_c_zrot(x.size(), x.ptr(), x.stride(), y.ptr(), y.stride(), c, s); +} } // namespace lapack_cpp #endif // USE_FORTRAN_BLAS \ No newline at end of file From 81be118bcb799b921c7dc4270e184ef4d321bf20 Mon Sep 17 00:00:00 2001 From: thijssteel Date: Mon, 4 Mar 2024 09:41:25 +0100 Subject: [PATCH 05/10] change filenames --- LAPACK_CPP/Makefile | 10 +++++----- LAPACK_CPP/include/lapack_cpp/base/enums.hpp | 5 +++++ LAPACK_CPP/src/lapack_c/{lartg_c.f90 => lartg.f90} | 0 LAPACK_CPP/src/lapack_c/{lasr3_c.cpp => lasr3.cpp} | 0 LAPACK_CPP/src/lapack_c/{rot_c.f90 => rot.f90} | 0 LAPACK_CPP/src/lapack_cpp/{lartg_cpp.cpp => lartg.cpp} | 0 LAPACK_CPP/src/lapack_cpp/{rot_cpp.cpp => rot.cpp} | 0 7 files changed, 10 insertions(+), 5 deletions(-) rename LAPACK_CPP/src/lapack_c/{lartg_c.f90 => lartg.f90} (100%) rename LAPACK_CPP/src/lapack_c/{lasr3_c.cpp => lasr3.cpp} (100%) rename LAPACK_CPP/src/lapack_c/{rot_c.f90 => rot.f90} (100%) rename LAPACK_CPP/src/lapack_cpp/{lartg_cpp.cpp => lartg.cpp} (100%) rename LAPACK_CPP/src/lapack_cpp/{rot_cpp.cpp => rot.cpp} (100%) diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile index ec6b653a4..46939582e 100644 --- a/LAPACK_CPP/Makefile +++ b/LAPACK_CPP/Makefile @@ -14,11 +14,11 @@ HEADERS = $(wildcard include/lapack_c/*.h) \ $(wildcard include/lapack_cpp/*/*.hpp) \ $(wildcard include/lapack_cpp/*/*/*.hpp) -OBJFILES = src/lapack_cpp/rot_cpp.o \ - src/lapack_cpp/lartg_cpp.o \ - src/lapack_c/rot_c.o \ - src/lapack_c/lartg_c.o \ - src/lapack_c/lasr3_c.o \ +OBJFILES = src/lapack_cpp/rot.o \ + src/lapack_cpp/lartg.o \ + src/lapack_c/rot.o \ + src/lapack_c/lartg.o \ + src/lapack_c/lasr3.o \ src/lapack_fortran/slasr3.o \ src/lapack_fortran/dlasr3.o \ src/lapack_fortran/clasr3.o \ diff --git a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp index d3f929aca..6fd09bb0e 100644 --- a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp +++ b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp @@ -20,6 +20,7 @@ inline constexpr Op char2op(char t) return Op::ConjTrans; default: assert(false); + return Op::NoTrans; } } @@ -34,6 +35,7 @@ inline constexpr Side char2side(char t) return Side::Right; default: assert(false); + return Side::Left; } } @@ -48,6 +50,7 @@ inline constexpr Uplo char2uplo(char t) return Uplo::Lower; default: assert(false); + return Uplo::Upper; } } @@ -62,6 +65,7 @@ inline constexpr Diag char2diag(char t) return Diag::NonUnit; default: assert(false); + return Diag::NonUnit; } } @@ -76,6 +80,7 @@ inline constexpr Direction char2direct(char t) return Direction::Backward; default: assert(false); + return Direction::Forward; } } diff --git a/LAPACK_CPP/src/lapack_c/lartg_c.f90 b/LAPACK_CPP/src/lapack_c/lartg.f90 similarity index 100% rename from LAPACK_CPP/src/lapack_c/lartg_c.f90 rename to LAPACK_CPP/src/lapack_c/lartg.f90 diff --git a/LAPACK_CPP/src/lapack_c/lasr3_c.cpp b/LAPACK_CPP/src/lapack_c/lasr3.cpp similarity index 100% rename from LAPACK_CPP/src/lapack_c/lasr3_c.cpp rename to LAPACK_CPP/src/lapack_c/lasr3.cpp diff --git a/LAPACK_CPP/src/lapack_c/rot_c.f90 b/LAPACK_CPP/src/lapack_c/rot.f90 similarity index 100% rename from LAPACK_CPP/src/lapack_c/rot_c.f90 rename to LAPACK_CPP/src/lapack_c/rot.f90 diff --git a/LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/lartg.cpp similarity index 100% rename from LAPACK_CPP/src/lapack_cpp/lartg_cpp.cpp rename to LAPACK_CPP/src/lapack_cpp/lartg.cpp diff --git a/LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp b/LAPACK_CPP/src/lapack_cpp/rot.cpp similarity index 100% rename from LAPACK_CPP/src/lapack_cpp/rot_cpp.cpp rename to LAPACK_CPP/src/lapack_cpp/rot.cpp From 0d9803a29c157961e29f2ddf530ba008a79502d7 Mon Sep 17 00:00:00 2001 From: thijssteel Date: Wed, 6 Mar 2024 11:58:41 +0100 Subject: [PATCH 06/10] add c wrapper to cpp rot --- LAPACK_CPP/Makefile | 25 ++++++------- LAPACK_CPP/src/lapack_c/rot.cpp | 65 +++++++++++++++++++++++++++++++++ LAPACK_CPP/src/lapack_c/rot.f90 | 4 +- 3 files changed, 79 insertions(+), 15 deletions(-) create mode 100644 LAPACK_CPP/src/lapack_c/rot.cpp diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile index 46939582e..3badab025 100644 --- a/LAPACK_CPP/Makefile +++ b/LAPACK_CPP/Makefile @@ -14,30 +14,27 @@ HEADERS = $(wildcard include/lapack_c/*.h) \ $(wildcard include/lapack_cpp/*/*.hpp) \ $(wildcard include/lapack_cpp/*/*/*.hpp) -OBJFILES = src/lapack_cpp/rot.o \ - src/lapack_cpp/lartg.o \ - src/lapack_c/rot.o \ - src/lapack_c/lartg.o \ - src/lapack_c/lasr3.o \ - src/lapack_fortran/slasr3.o \ - src/lapack_fortran/dlasr3.o \ - src/lapack_fortran/clasr3.o \ - src/lapack_fortran/zlasr3.o \ +SRCFILES = $(wildcard src/lapack_c/*.cpp) \ + $(wildcard src/lapack_c/*.f90) \ + $(wildcard src/lapack_cpp/*.cpp) \ + $(wildcard src/lapack_fortran/*.f90) + +OBJFILES = $(patsubst %.f90, %_f90.o ,$(patsubst %.cpp,%_cpp.o,$(SRCFILES))) # Fortran files -src/lapack_fortran/%.o: src/lapack_fortran/%.f90 +src/lapack_fortran/%_f90.o: src/lapack_fortran/%.f90 $(FC) $(FFLAGS) -cpp -c -o $@ $< # C files -src/lapack_c/%.o: src/lapack_c/%.cpp $(HEADERS) +src/lapack_c/%_cpp.o: src/lapack_c/%.cpp $(HEADERS) $(CXX) $(CXXFLAGS) -I./include -c -o $@ $< -src/lapack_c/%.o: src/lapack_c/%.f90 - $(FC) $(FFLAGS) -c -o $@ $< +src/lapack_c/%_f90.o: src/lapack_c/%.f90 + $(FC) $(FFLAGS) -cpp -c -o $@ $< # C++ files -src/lapack_cpp/%.o: src/lapack_cpp/%.cpp $(HEADERS) +src/lapack_cpp/%_cpp.o: src/lapack_cpp/%.cpp $(HEADERS) $(CXX) $(CXXFLAGS) -I./include -c -o $@ $< # Test files diff --git a/LAPACK_CPP/src/lapack_c/rot.cpp b/LAPACK_CPP/src/lapack_c/rot.cpp new file mode 100644 index 000000000..f982276a2 --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/rot.cpp @@ -0,0 +1,65 @@ +#ifndef USE_FORTRAN_BLAS +#include "lapack_c/rot.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/blas/rot.hpp" + +using namespace lapack_cpp; + +// Actual definitions +extern "C" +{ + + void lapack_c_drot(lapack_idx n, + double *x, + lapack_idx incx, + double *y, + lapack_idx incy, + double c, + double s) + { + Vector x_(n, x, incx); + Vector y_(n, y, incy); + rot(x_, y_, c, s); + } + + void lapack_c_srot(lapack_idx n, + float *x, + lapack_idx incx, + float *y, + lapack_idx incy, + float c, + float s) + { + Vector x_(n, x, incx); + Vector y_(n, y, incy); + rot(x_, y_, c, s); + } + + void lapack_c_crot(lapack_idx n, + lapack_float_complex *x, + lapack_idx incx, + lapack_float_complex *y, + lapack_idx incy, + float c, + lapack_float_complex s) + { + Vector x_(n, x, incx); + Vector y_(n, y, incy); + rot(x_, y_, c, s); + } + + void lapack_c_zrot(lapack_idx n, + lapack_double_complex *x, + lapack_idx incx, + lapack_double_complex *y, + lapack_idx incy, + double c, + lapack_double_complex s) + { + Vector x_(n, x, incx); + Vector y_(n, y, incy); + rot(x_, y_, c, s); + } + +} // extern "C" +#endif \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/rot.f90 b/LAPACK_CPP/src/lapack_c/rot.f90 index 6876c9036..60df52f41 100644 --- a/LAPACK_CPP/src/lapack_c/rot.f90 +++ b/LAPACK_CPP/src/lapack_c/rot.f90 @@ -1,3 +1,4 @@ +#ifdef USE_FORTRAN_BLAS subroutine lapack_c_drot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_drot') use iso_c_binding implicit none @@ -54,4 +55,5 @@ subroutine lapack_c_zrot(n, dx ,incx, dy, incy, c, s) bind(c, name='lapack_c_zro ! Forward call to fortran implementation call zrot( n, dx ,incx, dy, incy, c, s ) -end subroutine lapack_c_zrot \ No newline at end of file +end subroutine lapack_c_zrot +#endif \ No newline at end of file From a4c4a4eb5261f37149a8b5216fc36a4b47298e68 Mon Sep 17 00:00:00 2001 From: thijssteel Date: Wed, 6 Mar 2024 14:17:19 +0100 Subject: [PATCH 07/10] add steqr3 --- .gitignore | 3 + LAPACK_CPP/include/lapack_c.h | 4 +- LAPACK_CPP/include/lapack_c/laev2.h | 19 + LAPACK_CPP/include/lapack_c/lasrt.h | 19 + LAPACK_CPP/include/lapack_cpp/base/enums.hpp | 15 + .../include/lapack_cpp/lapack/laev2.hpp | 55 +++ .../include/lapack_cpp/lapack/lasr3.hpp | 4 +- .../include/lapack_cpp/lapack/lasrt.hpp | 17 + .../include/lapack_cpp/lapack/steqr3.hpp | 428 ++++++++++++++++++ LAPACK_CPP/src/lapack_c/laev2.f90 | 25 + LAPACK_CPP/src/lapack_c/lasrt.f90 | 37 ++ LAPACK_CPP/src/lapack_cpp/laev2.cpp | 23 + LAPACK_CPP/src/lapack_cpp/lasrt.cpp | 51 +++ 13 files changed, 697 insertions(+), 3 deletions(-) create mode 100644 LAPACK_CPP/include/lapack_c/laev2.h create mode 100644 LAPACK_CPP/include/lapack_c/lasrt.h create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp create mode 100644 LAPACK_CPP/src/lapack_c/laev2.f90 create mode 100644 LAPACK_CPP/src/lapack_c/lasrt.f90 create mode 100644 LAPACK_CPP/src/lapack_cpp/laev2.cpp create mode 100644 LAPACK_CPP/src/lapack_cpp/lasrt.cpp diff --git a/.gitignore b/.gitignore index 568e67b33..361dc7bd8 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,6 @@ build* DOCS/man DOCS/explore-html output_err + +# Editor config files +.vscode/ \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h index 4212a73f4..f6ad2c42e 100644 --- a/LAPACK_CPP/include/lapack_c.h +++ b/LAPACK_CPP/include/lapack_c.h @@ -8,6 +8,8 @@ // LAPACK #include "lapack_c/lartg.h" - +#include "lapack_c/lasrt.h" +#include "lapack_c/laev2.h" +#include "lapack_c/lasrt.h" #endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/laev2.h b/LAPACK_CPP/include/lapack_c/laev2.h new file mode 100644 index 000000000..2cb5a6ad4 --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/laev2.h @@ -0,0 +1,19 @@ +#ifndef LAPACK_C_LAEV2_HPP +#define LAPACK_C_LAEV2_HPP + +#include "lapack_c/util.h" + +#ifdef __cplusplus +extern "C" +{ +#endif // __cplusplus + + void lapack_c_slaev2(float a, float b, float c, float *rt1, float *rt2, float *cs1, float *sn1); + + void lapack_c_dlaev2(double a, double b, double c, double *rt1, double *rt2, double *cs1, double *sn1); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // LAPACK_C_LAEV2_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/lasrt.h b/LAPACK_CPP/include/lapack_c/lasrt.h new file mode 100644 index 000000000..6cf8a60d6 --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/lasrt.h @@ -0,0 +1,19 @@ +#ifndef LAPACK_C_LASRT_HPP +#define LAPACK_C_LASRT_HPP + +#include "lapack_c/util.h" + +#ifdef __cplusplus +extern "C" +{ +#endif // __cplusplus + + void lapack_c_slasrt(char id, lapack_idx n, float* d, lapack_idx *info); + + void lapack_c_dlasrt(char id, lapack_idx n, double* d, lapack_idx *info); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // LAPACK_C_LASRT_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp index 6fd09bb0e..6bd6c5365 100644 --- a/LAPACK_CPP/include/lapack_cpp/base/enums.hpp +++ b/LAPACK_CPP/include/lapack_cpp/base/enums.hpp @@ -84,6 +84,21 @@ inline constexpr Direction char2direct(char t) } } +enum class IncDec : char { Increasing = 'I', Decreasing = 'D' }; + +inline constexpr IncDec char2incdec(char t) +{ + switch (t) { + case 'I': + return IncDec::Increasing; + case 'D': + return IncDec::Decreasing; + default: + assert(false); + return IncDec::Increasing; + } +} + } // namespace lapack_cpp #endif // LAPACK_CPP_ENUMP_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp new file mode 100644 index 000000000..72a105f8c --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/laev2.hpp @@ -0,0 +1,55 @@ +#ifndef LAPACK_CPP_LAEV2_HPP +#define LAPACK_CPP_LAEV2_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/utils.hpp" + +namespace lapack_cpp { + +/** Computes the eigenvalues and eigenvector of a real symmetric 2x2 matrix A + * [ a b ] + * [ b c ] + * On exit, the decomposition satisfies: + * [ cs sn ] [ a b ] [ cs -sn ] = [ s1 0 ] + * [ -sn cs ] [ b c ] [ sn cs ] [ 0 s2 ] + * where cs*cs + sn*sn = 1. + * + * @param[in] a + * Element (0,0) of A. + * @param[in] b + * Element (0,1) and (1,0) of A. + * @param[in] c + * Element (1,1) of A. + * @param[out] s1 + * The eigenvalue of A with the largest absolute value. + * @param[out] s2 + * The eigenvalue of A with the smallest absolute value. + * @param[out] cs + * The cosine of the rotation matrix. + * @param[out] sn + * The sine of the rotation matrix. + * + * \verbatim + * s1 is accurate to a few ulps barring over/underflow. + * + * s2 may be inaccurate if there is massive cancellation in the + * determinant a*c-b*b; higher precision or correctly rounded or + * correctly truncated arithmetic would be needed to compute s2 + * accurately in all cases. + * + * cs and sn are accurate to a few ulps barring over/underflow. + * + * Overflow is possible only if s1 is within a factor of 5 of overflow. + * Underflow is harmless if the input data is 0 or exceeds + * underflow_threshold / macheps. + * \endverbatim + * + * + * @ingroup auxiliary + */ +template +void laev2(T a, T b, T c, T& s1, T& s2, T& cs, T& sn); + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_LAEV2_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp index 535ca4ce6..a1ecfa66f 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp @@ -51,7 +51,7 @@ void lasr3(Side side, ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, - MemoryBlock work); + MemoryBlock& work); /** * Workspace query for lasr3. @@ -698,7 +698,7 @@ void lasr3(Side side, ConstMatrix, layout, idx_t> C, ConstMatrix S, Matrix A, - MemoryBlock work) + MemoryBlock& work) { const idx_t m = A.num_rows(); const idx_t n = A.num_columns(); diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp new file mode 100644 index 000000000..d81fb99bb --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasrt.hpp @@ -0,0 +1,17 @@ +#ifndef LAPACK_CPP_LASRT_HPP +#define LAPACK_CPP_LASRT_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/utils.hpp" + +namespace lapack_cpp { + +/** + * Sort the numbers in an array. + */ +template +void lasrt(IncDec id, Vector d); + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_LASRT_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp new file mode 100644 index 000000000..d6420dd6a --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp @@ -0,0 +1,428 @@ +#ifndef LAPACK_CPP_STEQR3_HPP +#define LAPACK_CPP_STEQR3_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/lasr3.hpp" +#include "lapack_cpp/lapack/lartg.hpp" +#include "lapack_cpp/lapack/lasrt.hpp" +#include "lapack_cpp/lapack/laev2.hpp" + +namespace lapack_cpp +{ + + /** + * STEQR3 computes all eigenvalues and, optionally, eigenvectors of a + * hermitian tridiagonal matrix using the implicit QL or QR method. + * The eigenvectors of a full or band hermitian matrix can also be found + * if SYTRD or SPTRD or SBTRD has been used to reduce this matrix to + * tridiagonal form. + * + * STEQR3 is a variant of STEQR that uses lasr3 to efficiently + * accumulate the rotations into the eigenvector matrix. + * + * @return 0 if success + * + * @param[in] want_z bool + * + * @param[in,out] d Real vector of length n. + * On entry, diagonal elements of the bidiagonal matrix B. + * On exit, the singular values of B in decreasing order. + * + * @param[in,out] e Real vector of length n-1. + * On entry, off-diagonal elements of the bidiagonal matrix B. + * On exit, the singular values of B in decreasing order. + * + * @param[in,out] Z n-by-m matrix. + * On entry, the n-by-n unitary matrix used in the reduction + * to tridiagonal form. + * On exit, if info = 0, and want_z=true then Z contains the + * orthonormal eigenvectors of the original Hermitian matrix. + * + * @ingroup computational + */ + template + int steqr3( + bool want_z, + Vector, idx_t> d, + Vector, idx_t> e, + Matrix Z, + MemoryBlock, idx_t, aligned> &rwork, + MemoryBlock &work) + { + using T_real = real_t; + + // constants + const T_real two(2); + const T_real one(1); + const T_real zero(0); + const idx_t n = size(d); + + // Amount of rotation sequences to generate before applying it. + const auto nb = idx_t{32}; + + // Quick return if possible + if (n == 0) + return 0; + if (n == 1) + { + if (want_z) + Z(0, 0) = one; + return 0; + } + + // Determine the unit roundoff and over/underflow thresholds. + const T_real eps = ulp(); + const T_real eps2 = eps * eps; + const T_real safmin = safe_min(); + + // Compute the eigenvalues and eigenvectors of the tridiagonal + // matrix. + const idx_t itmax = 30 * n; + + // istart and istop determine the active block + idx_t istart = 0; + idx_t istop = n; + + // Keep track of previous istart and istop to know when to change direction + idx_t istart_old = -1; + idx_t istop_old = -1; + + // Keep track of the delayed rotation sequences + idx_t i_block = 0; + + // C and S are used to store the rotation sequences + Matrix C(n - 1, nb, rwork); + auto rwork2 = rwork.remainder(n - 1, nb); + Matrix S(n - 1, nb, rwork2); + + // Initialize C and S to identity rotations + for (idx_t j = 0; j < nb; ++j) + { + for (idx_t i = 0; i < n - 1; ++i) + { + C(i, j) = one; + S(i, j) = zero; + } + } + + // Direction to chase the bulge + // This variable is reevaluated for every new subblock + Direction direction = d[0] > d[n - 1] ? Direction::Forward : Direction::Backward; + + // Main loop + for (idx_t iter = 0; iter < itmax; iter++) + { + // If we have saved up enough rotations, apply them + if (want_z and (i_block >= nb or iter == itmax or istop <= 1)) + { + idx_t i_block2 = min(i_block + 1, nb); + + // Find smallest index where rotation is not identity + idx_t i_start_block = n - 1; + for (idx_t i = 0; i < i_block2; ++i) + { + for (idx_t j = 0; j < i_start_block; ++j) + if (C(j, i) != one or S(j, i) != zero) + { + i_start_block = j; + break; + } + } + + // Find largest index where rotation is not identity + idx_t i_stop_block = 0; + for (idx_t i = 0; i < i_block2; ++i) + { + for (idx_t j2 = n - 1; j2 > i_stop_block; --j2) + { + idx_t j = j2 - 1; + if (C(j, i) != one or S(j, i) != zero) + { + i_stop_block = j; + break; + } + } + } + + if (i_start_block < i_stop_block + 1) + { + auto C2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); + auto S2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); + auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2); + lasr3(Side::Right, direction, C2, S2, Z2, work); + } + // Reset block + i_block = 0; + + // Initialize C and S to identity rotations + for (idx_t j = 0; j < nb; ++j) + { + for (idx_t i = 0; i < n - 1; ++i) + { + C(i, j) = one; + S(i, j) = zero; + } + } + } + if (iter == itmax) + { + // The QR algorithm failed to converge, return with error. + return istop; + } + + if (istop <= 1) + { + // All eigenvalues have been found, exit and return 0. + break; + } + + // Find active block + for (idx_t i = istop - 1; i > istart; --i) + { + if ((e[i - 1] * e[i - 1]) <= + (eps2 * abs(d[i - 1])) * abs(d[i]) + safmin) + { + e[i - 1] = zero; + istart = i; + break; + } + } + + // An eigenvalue has split off, reduce istop and start the loop again + if (istart == istop - 1) + { + istop = istop - 1; + istart = 0; + continue; + } + + // A 2x2 block has split off, handle separately + if (istart + 1 == istop - 1) + { + T_real s1, s2; + if (want_z) + { + T_real cs, sn; + laev2(d[istart], e[istart], d[istart + 1], s1, s2, cs, sn); + + // Store rotation + C(istart, i_block) = cs; + S(istart, i_block) = sn; + // Normally, we would increment i_block here, but we do not + // because the block has been deflated and should not interfere + // with the next sequence. + } + else + { + lae2(d[istart], e[istart], d[istart + 1], s1, s2); + } + d[istart] = s1; + d[istart + 1] = s2; + e[istart] = zero; + + istop = istop - 2; + istart = 0; + continue; + } + + // Choose between QL and QR iteration + if (istart >= istop_old or istop <= istart_old) + { + Direction direction_new; + // We prefer to keep the current direction, because switching + // direction forces us to apply the rotations in the block, which + // may be inefficient if the block is small. + if (direction == Direction::Forward) + { + direction_new = 10. * abs(d[istart]) > abs(d[istop - 1]) ? Direction::Forward : Direction::Backward; + } + else + { + direction_new = + abs(d[istart]) > 10. * abs(d[istop - 1]) ? Direction::Forward : Direction::Backward; + } + + if (direction_new != direction) + { + // We don't want different directions in our saved rotation matrices + // So we apply all the current rotations and reset the block + idx_t i_block2 = min(i_block + 1, nb); + + // Find smallest index where rotation is not identity + idx_t i_start_block = n - 1; + for (idx_t i = 0; i < i_block2; ++i) + { + for (idx_t j = 0; j < i_start_block; ++j) + if (C(j, i) != one or S(j, i) != zero) + { + i_start_block = j; + break; + } + } + + // Find largest index where rotation is not identity + idx_t i_stop_block = 0; + for (idx_t i = 0; i < i_block2; ++i) + { + for (idx_t j2 = n - 1; j2 > i_stop_block; --j2) + { + idx_t j = j2 - 1; + if (C(j, i) != one or S(j, i) != zero) + { + i_stop_block = j; + break; + } + } + } + + if (i_start_block < i_stop_block + 1) + { + auto C2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); + auto S2 = S.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); + auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2); + + lasr3(Side::Right, direction, C2, S2, Z2, work); + } + // Reset block + i_block = 0; + + // Initialize C and S to identity rotations + for (idx_t j = 0; j < nb; ++j) + { + for (idx_t i = 0; i < n - 1; ++i) + { + C(i, j) = one; + S(i, j) = zero; + } + } + } + direction = direction_new; + } + istart_old = istart; + istop_old = istop; + + if (direction == Direction::Forward) + { + // QR iteration + + // Form shift using last 2x2 block of the active matrix + T_real p = d[istop - 1]; + T_real g = (d[istop - 2] - p) / (two * e[istop - 2]); + T_real r = lapy2(g, one); + g = d[istart] - p + e[istop - 2] / (T_real)(g + (sgn(g) * r)); + + T_real s = one; + T_real c = one; + p = zero; + + // Chase bulge from top to bottom + for (idx_t i = istart; i < istop - 1; ++i) + { + T_real f = s * e[i]; + T_real b = c * e[i]; + lartg(g, f, c, s, r); + if (i != istart) + e[i - 1] = r; + g = d[i] - p; + r = (d[i + 1] - g) * s + two * c * b; + p = s * r; + d[i] = g + p; + g = c * r - b; + // If eigenvalues are desired, then apply rotations + if (want_z) + { + // Store rotation for later + C(i, i_block) = c; + S(i, i_block) = s; + } + } + d[istop - 1] = d[istop - 1] - p; + e[istop - 2] = g; + } + else + { + // QL iteration + + // Form shift using last 2x2 block of the active matrix + T_real p = d[istart]; + T_real g = (d[istart + 1] - p) / (two * e[istart]); + T_real r = lapy2(g, one); + g = d[istop - 1] - p + e[istart] / (T_real)(g + (sgn(g) * r)); + + T_real s = one; + T_real c = one; + p = zero; + + // Chase bulge from bottom to top + for (idx_t i = istop - 1; i > istart; --i) + { + T_real f = s * e[i - 1]; + T_real b = c * e[i - 1]; + lartg(g, f, c, s, r); + if (i != istop - 1) + e[i] = r; + g = d[i] - p; + r = (d[i - 1] - g) * s + two * c * b; + p = s * r; + d[i] = g + p; + g = c * r - b; + // If eigenvalues are desired, then apply rotations + if (want_z) + { + // Store rotation for later + C(i - 1, i_block) = c; + S(i - 1, i_block) = -s; + } + } + d[istart] = d[istart] - p; + e[istart] = g; + } + i_block++; + } + + // Order eigenvalues and eigenvectors + if (!want_z) + { + // Order eigenvalues + lasrt(IncDec::Increasing, d); + } + else + { + // Use selection sort to minize swaps of eigenvectors + for (idx_t i = 0; i < n - 1; ++i) + { + idx_t k = i; + T_real p = d[i]; + for (idx_t j = i + 1; j < n; ++j) + { + if (d[j] < p) + { + k = j; + p = d[j]; + } + } + if (k != i) + { + d[k] = d[i]; + d[i] = p; + auto z1 = col(Z, i); + auto z2 = col(Z, k); + for (idx_t j = 0; j < n; ++j) + { + auto temp = z1[j]; + z1[j] = z2[j]; + z2[j] = temp; + } + } + } + } + + return 0; + } + +} + +#endif \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/laev2.f90 b/LAPACK_CPP/src/lapack_c/laev2.f90 new file mode 100644 index 000000000..0fc787ff3 --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/laev2.f90 @@ -0,0 +1,25 @@ +subroutine lapack_c_slaev2(a, b, c, rt1, rt2, cs1, sn1) bind(c, name='lapack_c_slaev2') + use iso_c_binding, only: c_float + implicit none + real(kind=c_float), intent(in), value :: a, b, c + real(kind=c_float), intent(out) :: rt1, rt2, cs1, sn1 + + external slaev2 + + ! Forward call to fortran implementation + call slaev2( a, b, c, rt1, rt2, cs1, sn1) + +end subroutine lapack_c_slaev2 + +subroutine lapack_c_dlaev2(a, b, c, rt1, rt2, cs1, sn1) bind(c, name='lapack_c_dlaev2') + use iso_c_binding, only: c_double + implicit none + real(kind=c_double), intent(in), value :: a, b, c + real(kind=c_double), intent(out) :: rt1, rt2, cs1, sn1 + + external dlaev2 + + ! Forward call to fortran implementation + call dlaev2( a, b, c, rt1, rt2, cs1, sn1) + +end subroutine lapack_c_dlaev2 \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/lasrt.f90 b/LAPACK_CPP/src/lapack_c/lasrt.f90 new file mode 100644 index 000000000..560a2990b --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/lasrt.f90 @@ -0,0 +1,37 @@ +subroutine lapack_c_slasrt(id, n, d, info) bind(c, name='lapack_c_slasrt') + use iso_c_binding, only: c_char, c_int, c_float + implicit none + character(kind=c_char), intent(in), value :: id + integer(kind=c_int), intent(in), value :: n + real(kind=c_float), intent(inout) :: d(*) + integer(kind=c_int), intent(out) :: info + + character :: id_fortran + + external slasrt + + ! Convert the character to fortran + id_fortran = id + ! Forward call to fortran implementation + call slasrt( id_fortran, n, d, info) + +end subroutine lapack_c_slasrt + +subroutine lapack_c_dlasrt(id, n, d, info) bind(c, name='lapack_c_dlasrt') + use iso_c_binding, only: c_char, c_int, c_double + implicit none + character(kind=c_char), intent(in), value :: id + integer(kind=c_int), intent(in), value :: n + real(kind=c_double), intent(inout) :: d(*) + integer(kind=c_int), intent(out) :: info + + character :: id_fortran + + external dlasrt + + ! Convert the character to fortran + id_fortran = id + ! Forward call to fortran implementation + call dlasrt( id_fortran, n, d, info) + +end subroutine lapack_c_dlasrt \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_cpp/laev2.cpp b/LAPACK_CPP/src/lapack_cpp/laev2.cpp new file mode 100644 index 000000000..2b74e53bc --- /dev/null +++ b/LAPACK_CPP/src/lapack_cpp/laev2.cpp @@ -0,0 +1,23 @@ +#include +#include +#include + +#include "lapack_c.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/laev2.hpp" + +namespace lapack_cpp { + +template <> +void laev2(float a, float b, float c, float& s1, float& s2, float& cs, float& sn) +{ + lapack_c_slaev2(a, b, c, &s1, &s2, &cs, &sn); +} + +template <> +void laev2(double a, double b, double c, double& s1, double& s2, double& cs, double& sn) +{ + lapack_c_dlaev2(a, b, c, &s1, &s2, &cs, &sn); +} + +} // namespace lapack_cpp \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_cpp/lasrt.cpp b/LAPACK_CPP/src/lapack_cpp/lasrt.cpp new file mode 100644 index 000000000..308059887 --- /dev/null +++ b/LAPACK_CPP/src/lapack_cpp/lasrt.cpp @@ -0,0 +1,51 @@ +#include +#include +#include + +#include "lapack_c.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/lasrt.hpp" + +namespace lapack_cpp { + +template <> +void lasrt(IncDec id, Vector d) +{ + assert(d.size() >= 0); + // fortran code only supports stride 1 + assert(d.stride() == 1); + lapack_idx info; + lapack_c_slasrt((char) id, d.size(), d.ptr(), &info); +} + +template <> +void lasrt(IncDec id, Vector d) +{ + assert(d.size() >= 0); + // fortran code only supports stride 1 + assert(d.stride() == 1); + lapack_idx info; + lapack_c_dlasrt((char) id, d.size(), d.ptr(), &info); +} + +template <> +void lasrt(IncDec id, Vector d) +{ + assert(d.size() >= 0); + // fortran code only supports stride 1 + assert(d.stride() == 1); + lapack_idx info; + lapack_c_slasrt((char) id, d.size(), d.ptr(), &info); +} + +template <> +void lasrt(IncDec id, Vector d) +{ + assert(d.size() >= 0); + // fortran code only supports stride 1 + assert(d.stride() == 1); + lapack_idx info; + lapack_c_dlasrt((char) id, d.size(), d.ptr(), &info); +} + +} // namespace lapack_cpp \ No newline at end of file From b44c64faefb637cc24a56033065d9a104e32b499 Mon Sep 17 00:00:00 2001 From: thijssteel Date: Wed, 6 Mar 2024 15:58:48 +0100 Subject: [PATCH 08/10] add steqr3 --- LAPACK_CPP/Makefile | 9 +- LAPACK_CPP/example/.gitignore | 3 +- LAPACK_CPP/example/profile_steqr3.cpp | 82 +++++++++++++++++++ LAPACK_CPP/include/lapack_c.h | 1 + LAPACK_CPP/include/lapack_c/lae2.h | 19 +++++ LAPACK_CPP/include/lapack_cpp/base/matrix.hpp | 10 +++ LAPACK_CPP/include/lapack_cpp/base/types.hpp | 32 ++++++++ LAPACK_CPP/include/lapack_cpp/lapack.hpp | 4 + LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp | 18 ++++ .../include/lapack_cpp/lapack/lapy2.hpp | 34 ++++++++ .../include/lapack_cpp/lapack/lartg.hpp | 2 +- .../include/lapack_cpp/lapack/steqr3.hpp | 67 +++++++++++++-- LAPACK_CPP/src/lapack_c/lae2.f90 | 25 ++++++ LAPACK_CPP/src/lapack_cpp/lae2.cpp | 23 ++++++ 14 files changed, 313 insertions(+), 16 deletions(-) create mode 100644 LAPACK_CPP/example/profile_steqr3.cpp create mode 100644 LAPACK_CPP/include/lapack_c/lae2.h create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp create mode 100644 LAPACK_CPP/src/lapack_c/lae2.f90 create mode 100644 LAPACK_CPP/src/lapack_cpp/lae2.cpp diff --git a/LAPACK_CPP/Makefile b/LAPACK_CPP/Makefile index 3badab025..80660b628 100644 --- a/LAPACK_CPP/Makefile +++ b/LAPACK_CPP/Makefile @@ -1,11 +1,11 @@ CXX=g++ -# CXXFLAGS=-std=c++17 -Wall -Wextra -pedantic -g -Og +# CXXFLAGS=-std=c++17 -Wall -Wextra -pedantic -g CXXFLAGS=-std=c++17 -O3 -march=native -ffast-math -g -DNDEBUG FC=gfortran FFLAGS=-Wall -Wextra -pedantic -g -all: ./example/test_lasr3 ./example/profile_lasr3 +all: ./example/test_lasr3 ./example/profile_lasr3 ./example/profile_steqr3 $(OBJFILES) HEADERS = $(wildcard include/lapack_c/*.h) \ $(wildcard include/lapack_c/*/*.h) \ @@ -38,10 +38,7 @@ src/lapack_cpp/%_cpp.o: src/lapack_cpp/%.cpp $(HEADERS) $(CXX) $(CXXFLAGS) -I./include -c -o $@ $< # Test files -example/test_lasr3: example/test_lasr3.cpp $(OBJFILES) $(HEADERS) - $(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack - -example/profile_lasr3: example/profile_lasr3.cpp $(OBJFILES) $(HEADERS) +example/%: example/%.cpp $(OBJFILES) $(HEADERS) $(CXX) $(CXXFLAGS) -o $@ $< $(OBJFILES) -I./include -lstdc++ -lgfortran -lblas -llapack clean: diff --git a/LAPACK_CPP/example/.gitignore b/LAPACK_CPP/example/.gitignore index ed20f170e..d92df4173 100644 --- a/LAPACK_CPP/example/.gitignore +++ b/LAPACK_CPP/example/.gitignore @@ -1,2 +1,3 @@ test_lasr3 -profile_lasr3 \ No newline at end of file +profile_lasr3 +profile_steqr3 \ No newline at end of file diff --git a/LAPACK_CPP/example/profile_steqr3.cpp b/LAPACK_CPP/example/profile_steqr3.cpp new file mode 100644 index 000000000..d0f20db82 --- /dev/null +++ b/LAPACK_CPP/example/profile_steqr3.cpp @@ -0,0 +1,82 @@ + +#include // for high_resolution_clock +#include +#include + +#include "lapack_cpp.hpp" + +using namespace lapack_cpp; + +template +void profile_steqr3(idx_t n) +{ + MemoryBlock Z_(n, n, layout); + Matrix Z(n, n, Z_); + + MemoryBlock d_(n); + Vector d(n, d_); + + MemoryBlock e_(n - 1); + Vector e(n - 1, e_); + + MemoryBlock d_copy_(n); + Vector d_copy(n, d_copy_); + + MemoryBlock e_copy_(n - 1); + Vector e_copy(n - 1, e_copy_); + + randomize(d); + randomize(e); + + d_copy = d; + e_copy = e; + + bool want_z = true; + + MemoryBlock work(steqr3_workquery(want_z, d, e, Z)); + MemoryBlock, idx_t> rwork(steqr3_rworkquery(want_z, d, e, Z)); + + const idx_t n_timings = 100; + const idx_t n_warmup = 50; + std::vector timings(n_timings); + + for (idx_t i = 0; i < n_timings; ++i) + { + d = d_copy; + e = e_copy; + auto start = std::chrono::high_resolution_clock::now(); + steqr3(want_z, d, e, Z, rwork, work); + auto end = std::chrono::high_resolution_clock::now(); + timings[i] = std::chrono::duration(end - start).count(); + std::cout<(n); + } + return 0; +} \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h index f6ad2c42e..70e416e45 100644 --- a/LAPACK_CPP/include/lapack_c.h +++ b/LAPACK_CPP/include/lapack_c.h @@ -9,6 +9,7 @@ // LAPACK #include "lapack_c/lartg.h" #include "lapack_c/lasrt.h" +#include "lapack_c/lae2.h" #include "lapack_c/laev2.h" #include "lapack_c/lasrt.h" diff --git a/LAPACK_CPP/include/lapack_c/lae2.h b/LAPACK_CPP/include/lapack_c/lae2.h new file mode 100644 index 000000000..881db98ff --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/lae2.h @@ -0,0 +1,19 @@ +#ifndef LAPACK_C_LAE2_HPP +#define LAPACK_C_LAE2_HPP + +#include "lapack_c/util.h" + +#ifdef __cplusplus +extern "C" +{ +#endif // __cplusplus + + void lapack_c_slae2(float a, float b, float c, float *rt1, float *rt2); + + void lapack_c_dlae2(double a, double b, double c, double *rt1, double *rt2); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // LAPACK_C_LAE2_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp index ad10aae8c..d97b18052 100644 --- a/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp +++ b/LAPACK_CPP/include/lapack_cpp/base/matrix.hpp @@ -66,6 +66,11 @@ class Matrix { assert(ldim >= (layout == Layout::ColMajor ? m : n)); } + // Constructor for a matrix of size m x n + Matrix(const idx_t m, const idx_t n, T* ptr) + : m_(m), n_(n), ldim_(layout == Layout::ColMajor ? m : n), data_(ptr) + {} + // Copy constructor Matrix(const Matrix& m) : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr()) @@ -222,6 +227,11 @@ class ConstMatrix { assert(ldim >= (layout == Layout::ColMajor ? m : n)); } + // Constructor for a matrix of size m x n + ConstMatrix(const idx_t m, const idx_t n, const T* ptr) + : m_(m), n_(n), ldim_(layout == Layout::ColMajor ? m : n), data_(ptr) + {} + // Copy constructor ConstMatrix(const ConstMatrix& m) : m_(m.num_rows()), n_(m.num_columns()), ldim_(m.ldim()), data_(m.ptr()) diff --git a/LAPACK_CPP/include/lapack_cpp/base/types.hpp b/LAPACK_CPP/include/lapack_cpp/base/types.hpp index 2ce546bae..3b4475c27 100644 --- a/LAPACK_CPP/include/lapack_cpp/base/types.hpp +++ b/LAPACK_CPP/include/lapack_cpp/base/types.hpp @@ -30,4 +30,36 @@ struct real_type_struct> { template using real_t = typename real_type_struct::type; + +template +constexpr real_t ulp() noexcept +{ + return std::numeric_limits>::epsilon(); +} + +template +constexpr real_t safe_min() noexcept +{ + const int fradix = std::numeric_limits>::radix; + const int expm = std::numeric_limits>::min_exponent; + const int expM = std::numeric_limits>::max_exponent; + + return pow(fradix, real_t(std::max(expm - 1, 1 - expM))); +} + +/// Sign function for real numbers +/// Note, this has the following behavior: +/// sgn(x) = 1 if x > 0 +/// sgn(x) = -1 if x < 0 +/// sgn(0) = 1 +/// sgn(-0) = 1 +/// sgn(+Inf) = 1 +/// sgn(-Inf) = 1 +/// sgn(NaN) = -1 +template +constexpr T sgn(const T& val) +{ + return (val >= T(0)) ? T(1) : T(-1); +} + #endif // LAPACK_CPP_BASE_TYPES_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack.hpp b/LAPACK_CPP/include/lapack_cpp/lapack.hpp index 31e03ce78..bb515fbf2 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack.hpp @@ -1,7 +1,11 @@ #ifndef LAPACK_CPP_LAPACK_HPP #define LAPACK_CPP_LAPACK_HPP +#include "lapack_cpp/lapack/lae2.hpp" +#include "lapack_cpp/lapack/laev2.hpp" #include "lapack_cpp/lapack/lartg.hpp" #include "lapack_cpp/lapack/lasr3.hpp" +#include "lapack_cpp/lapack/lasrt.hpp" +#include "lapack_cpp/lapack/steqr3.hpp" #endif // LAPACK_CPP_LAPACK_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp new file mode 100644 index 000000000..0e907d9a1 --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lae2.hpp @@ -0,0 +1,18 @@ +#ifndef LAPACK_CPP_LAE2_HPP +#define LAPACK_CPP_LAE2_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/utils.hpp" + +namespace lapack_cpp { + +/** Computes the eigenvalues of a real symmetric 2x2 matrix A + * [ a b ] + * [ b c ] + */ +template +void lae2(T a, T b, T c, T& s1, T& s2); + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_LAE2_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp new file mode 100644 index 000000000..f43de452d --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lapy2.hpp @@ -0,0 +1,34 @@ +#ifndef LAPACK_CPP_LAPY2_HPP +#define LAPACK_CPP_LAPY2_HPP + +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/utils.hpp" +namespace lapack_cpp { + +/** + * Return \f$ \sqrt{f^2 + g^2} \f$ taking care to avoid overflow and underflow. + */ +template +T lapy2(T x, T y){ + // constants + const T one(1); + const T zero(0); + const T xabs = abs(x); + const T yabs = abs(y); + + T w, z; + if (xabs > yabs) { + w = xabs; + z = yabs; + } + else { + w = yabs; + z = xabs; + } + + return (z == zero) ? w : w * sqrt(one + (z / w) * (z / w)); +} + +} // namespace lapack_cpp + +#endif // LAPACK_CPP_LAPY2_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp index 5f5b345dd..d479abbc1 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lartg.hpp @@ -13,4 +13,4 @@ void lartg(T f, T g, real_t& c, T& s, T& r); } // namespace lapack_cpp -#endif // LAPACK_CPP_GEMV_HPP \ No newline at end of file +#endif // LAPACK_CPP_LARTG_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp index d6420dd6a..38333c512 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp @@ -6,9 +6,54 @@ #include "lapack_cpp/lapack/lartg.hpp" #include "lapack_cpp/lapack/lasrt.hpp" #include "lapack_cpp/lapack/laev2.hpp" +#include "lapack_cpp/lapack/lae2.hpp" +#include "lapack_cpp/lapack/lapy2.hpp" + namespace lapack_cpp { + template + idx_t steqr3_workquery(bool want_z, + Vector, idx_t> d, + Vector, idx_t> e, + Matrix Z) + { + const auto nb = idx_t{32}; + const idx_t n = d.size(); + + assert(n == e.size() + 1); + assert(n == Z.num_columns()); + assert(n == Z.num_rows()); + + // Create empty wrappers for C and S to do workspace query + ConstMatrix, layout, idx_t> C(n - 1, nb, nullptr); + ConstMatrix, layout, idx_t> S(n - 1, nb, nullptr); + + return lasr3_workquery(Side::Right, Direction::Forward, C, S, Z); + } + + template + idx_t steqr3_rworkquery(bool want_z, + Vector, idx_t> d, + Vector, idx_t> e, + Matrix Z) + { + const auto nb = idx_t{32}; + const idx_t n = d.size(); + + assert(n == e.size() + 1); + assert(n == Z.num_columns()); + assert(n == Z.num_rows()); + + if (layout == Layout::ColMajor) + return 2 * calc_ld, idx_t>(n - 1) * nb; + else + return 2 * (n - 1) * calc_ld, idx_t>(nb); + } /** * STEQR3 computes all eigenvalues and, optionally, eigenvectors of a @@ -58,7 +103,13 @@ namespace lapack_cpp const T_real two(2); const T_real one(1); const T_real zero(0); - const idx_t n = size(d); + const idx_t n = d.size(); + + assert(n == e.size() + 1); + assert(n == Z.num_columns()); + assert(n == Z.num_rows()); + assert( rwork.size() >= steqr3_rworkquery(want_z, d, e, Z) ); + assert( work.size() >= steqr3_workquery(want_z, d, e, Z) ); // Amount of rotation sequences to generate before applying it. const auto nb = idx_t{32}; @@ -118,7 +169,7 @@ namespace lapack_cpp // If we have saved up enough rotations, apply them if (want_z and (i_block >= nb or iter == itmax or istop <= 1)) { - idx_t i_block2 = min(i_block + 1, nb); + idx_t i_block2 = std::min(i_block + 1, nb); // Find smallest index where rotation is not identity idx_t i_start_block = n - 1; @@ -150,9 +201,9 @@ namespace lapack_cpp if (i_start_block < i_stop_block + 1) { auto C2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); - auto S2 = C.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); + auto S2 = S.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2); - lasr3(Side::Right, direction, C2, S2, Z2, work); + lasr3(Side::Right, direction, C2.as_const(), S2.as_const(), Z2, work); } // Reset block i_block = 0; @@ -249,7 +300,7 @@ namespace lapack_cpp { // We don't want different directions in our saved rotation matrices // So we apply all the current rotations and reset the block - idx_t i_block2 = min(i_block + 1, nb); + idx_t i_block2 = std::min(i_block + 1, nb); // Find smallest index where rotation is not identity idx_t i_start_block = n - 1; @@ -284,7 +335,7 @@ namespace lapack_cpp auto S2 = S.submatrix(i_start_block, i_stop_block + 1, 0, i_block2); auto Z2 = Z.submatrix(0, n, i_start_block, i_stop_block + 2); - lasr3(Side::Right, direction, C2, S2, Z2, work); + lasr3(Side::Right, direction, C2.as_const(), S2.as_const(), Z2, work); } // Reset block i_block = 0; @@ -408,8 +459,8 @@ namespace lapack_cpp { d[k] = d[i]; d[i] = p; - auto z1 = col(Z, i); - auto z2 = col(Z, k); + auto z1 = Z.column(i); + auto z2 = Z.column(k); for (idx_t j = 0; j < n; ++j) { auto temp = z1[j]; diff --git a/LAPACK_CPP/src/lapack_c/lae2.f90 b/LAPACK_CPP/src/lapack_c/lae2.f90 new file mode 100644 index 000000000..083a77d4e --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/lae2.f90 @@ -0,0 +1,25 @@ +subroutine lapack_c_slae2(a, b, c, rt1, rt2) bind(c, name='lapack_c_slae2') + use iso_c_binding, only: c_float + implicit none + real(kind=c_float), intent(in), value :: a, b, c + real(kind=c_float), intent(out) :: rt1, rt2 + + external slae2 + + ! Forward call to fortran implementation + call slae2( a, b, c, rt1, rt2) + +end subroutine lapack_c_slae2 + +subroutine lapack_c_dlae2(a, b, c, rt1, rt2) bind(c, name='lapack_c_dlae2') + use iso_c_binding, only: c_double + implicit none + real(kind=c_double), intent(in), value :: a, b, c + real(kind=c_double), intent(out) :: rt1, rt2 + + external dlae2 + + ! Forward call to fortran implementation + call dlae2( a, b, c, rt1, rt2) + +end subroutine lapack_c_dlae2 \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_cpp/lae2.cpp b/LAPACK_CPP/src/lapack_cpp/lae2.cpp new file mode 100644 index 000000000..d1396edda --- /dev/null +++ b/LAPACK_CPP/src/lapack_cpp/lae2.cpp @@ -0,0 +1,23 @@ +#include +#include +#include + +#include "lapack_c.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/lae2.hpp" + +namespace lapack_cpp { + +template <> +void lae2(float a, float b, float c, float& s1, float& s2) +{ + lapack_c_slae2(a, b, c, &s1, &s2); +} + +template <> +void lae2(double a, double b, double c, double& s1, double& s2) +{ + lapack_c_dlae2(a, b, c, &s1, &s2); +} + +} // namespace lapack_cpp \ No newline at end of file From 7a2ecc24e52f8dbe567dd488b61d2790b570ae7c Mon Sep 17 00:00:00 2001 From: thijssteel Date: Wed, 6 Mar 2024 16:33:46 +0100 Subject: [PATCH 09/10] add fortran wrappers for steqr3 --- LAPACK_CPP/example/profile_steqr3.cpp | 8 +- LAPACK_CPP/include/lapack_c.h | 1 + LAPACK_CPP/include/lapack_c/steqr3.h | 63 +++++++ LAPACK_CPP/include/lapack_cpp/base/enums.hpp | 17 ++ .../include/lapack_cpp/lapack/steqr3.hpp | 34 +++- LAPACK_CPP/src/lapack_c/steqr3.cpp | 169 ++++++++++++++++++ LAPACK_CPP/src/lapack_fortran/csteqr3.f90 | 31 ++++ LAPACK_CPP/src/lapack_fortran/dsteqr3.f90 | 29 +++ LAPACK_CPP/src/lapack_fortran/ssteqr3.f90 | 29 +++ LAPACK_CPP/src/lapack_fortran/zsteqr3.f90 | 31 ++++ 10 files changed, 402 insertions(+), 10 deletions(-) create mode 100644 LAPACK_CPP/include/lapack_c/steqr3.h create mode 100644 LAPACK_CPP/src/lapack_c/steqr3.cpp create mode 100644 LAPACK_CPP/src/lapack_fortran/csteqr3.f90 create mode 100644 LAPACK_CPP/src/lapack_fortran/dsteqr3.f90 create mode 100644 LAPACK_CPP/src/lapack_fortran/ssteqr3.f90 create mode 100644 LAPACK_CPP/src/lapack_fortran/zsteqr3.f90 diff --git a/LAPACK_CPP/example/profile_steqr3.cpp b/LAPACK_CPP/example/profile_steqr3.cpp index d0f20db82..517dd4b86 100644 --- a/LAPACK_CPP/example/profile_steqr3.cpp +++ b/LAPACK_CPP/example/profile_steqr3.cpp @@ -33,10 +33,10 @@ void profile_steqr3(idx_t n) d_copy = d; e_copy = e; - bool want_z = true; + CompQ compz = CompQ::Initialize; - MemoryBlock work(steqr3_workquery(want_z, d, e, Z)); - MemoryBlock, idx_t> rwork(steqr3_rworkquery(want_z, d, e, Z)); + MemoryBlock work(steqr3_workquery(compz, d, e, Z)); + MemoryBlock, idx_t> rwork(steqr3_rworkquery(compz, d, e, Z)); const idx_t n_timings = 100; const idx_t n_warmup = 50; @@ -47,7 +47,7 @@ void profile_steqr3(idx_t n) d = d_copy; e = e_copy; auto start = std::chrono::high_resolution_clock::now(); - steqr3(want_z, d, e, Z, rwork, work); + steqr3(compz, d, e, Z, work, rwork); auto end = std::chrono::high_resolution_clock::now(); timings[i] = std::chrono::duration(end - start).count(); std::cout< - idx_t steqr3_workquery(bool want_z, + idx_t steqr3_workquery(CompQ compz, Vector, idx_t> d, Vector, idx_t> e, Matrix Z) { + if( compz == CompQ::No ) + return 0; const auto nb = idx_t{32}; const idx_t n = d.size(); @@ -37,7 +39,7 @@ namespace lapack_cpp template - idx_t steqr3_rworkquery(bool want_z, + idx_t steqr3_rworkquery(CompQ compz, Vector, idx_t> d, Vector, idx_t> e, Matrix Z) @@ -82,6 +84,12 @@ namespace lapack_cpp * to tridiagonal form. * On exit, if info = 0, and want_z=true then Z contains the * orthonormal eigenvectors of the original Hermitian matrix. + * + * @param[out] work Workspace array whose size is returned by + * steqr3_workquery. + * + * @param[out] rwork Real workspace array whose size is returned by + * steqr3_rworkquery. * * @ingroup computational */ @@ -90,12 +98,12 @@ namespace lapack_cpp typename idx_t, bool aligned> int steqr3( - bool want_z, + CompQ compz, Vector, idx_t> d, Vector, idx_t> e, Matrix Z, - MemoryBlock, idx_t, aligned> &rwork, - MemoryBlock &work) + MemoryBlock &work, + MemoryBlock, idx_t, aligned> &rwork) { using T_real = real_t; @@ -119,11 +127,25 @@ namespace lapack_cpp return 0; if (n == 1) { - if (want_z) + if (compz == CompQ::Initialize) Z(0, 0) = one; return 0; } + bool want_z = compz != CompQ::No; + + if( compz == CompQ::Initialize ) + { + for (idx_t j = 0; j < n; ++j) + { + for (idx_t i = 0; i < n; ++i) + { + Z(i, j) = zero; + } + Z(j, j) = one; + } + } + // Determine the unit roundoff and over/underflow thresholds. const T_real eps = ulp(); const T_real eps2 = eps * eps; diff --git a/LAPACK_CPP/src/lapack_c/steqr3.cpp b/LAPACK_CPP/src/lapack_c/steqr3.cpp new file mode 100644 index 000000000..61f1fd64f --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/steqr3.cpp @@ -0,0 +1,169 @@ +#include "lapack_c/steqr3.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/steqr3.hpp" + +using namespace lapack_cpp; + +// Actual definitions +extern "C" +{ + + void lapack_c_ssteqr3(char compz, + lapack_idx n, + float *d, + float *e, + float *Z, + lapack_idx ldz, + float *work, + lapack_idx lwork, + float *rwork, + lapack_idx lrwork, + lapack_idx *info) + { + CompQ compz_ = char2compq(compz); + + Vector d_(n, d); + Vector e_(n-1, e); + Matrix Z_(n, n, Z, ldz); + + if (lwork == -1) + { + lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_); + *work = lwork_; + } + + if (lrwork == -1) + { + lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_); + *rwork = lrwork_; + } + + if( lwork == -1 || lrwork == -1) + return; + + MemoryBlock work_(lwork, work); + MemoryBlock rwork_(lrwork, rwork); + + *info = steqr3(compz_, d_, e_, Z_, work_, rwork_); + } + + + void lapack_c_dsteqr3(char compz, + lapack_idx n, + double *d, + double *e, + double *Z, + lapack_idx ldz, + double *work, + lapack_idx lwork, + double *rwork, + lapack_idx lrwork, + lapack_idx *info) + { + CompQ compz_ = char2compq(compz); + + Vector d_(n, d); + Vector e_(n-1, e); + Matrix Z_(n, n, Z, ldz); + + if (lwork == -1) + { + lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_); + *work = lwork_; + } + + if (lrwork == -1) + { + lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_); + *rwork = lrwork_; + } + + if( lwork == -1 || lrwork == -1) + return; + + MemoryBlock work_(lwork, work); + MemoryBlock rwork_(lrwork, rwork); + + *info = steqr3(compz_, d_, e_, Z_, work_, rwork_); + } + + void lapack_c_csteqr3(char compz, + lapack_idx n, + float *d, + float *e, + lapack_float_complex *Z, + lapack_idx ldz, + lapack_float_complex *work, + lapack_idx lwork, + float *rwork, + lapack_idx lrwork, + lapack_idx *info) + { + CompQ compz_ = char2compq(compz); + + Vector d_(n, d); + Vector e_(n-1, e); + Matrix Z_(n, n, Z, ldz); + + if (lwork == -1) + { + lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_); + *work = lwork_; + } + + if (lrwork == -1) + { + lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_); + *rwork = lrwork_; + } + + if( lwork == -1 || lrwork == -1) + return; + + MemoryBlock work_(lwork, work); + MemoryBlock rwork_(lrwork, rwork); + + *info = steqr3(compz_, d_, e_, Z_, work_, rwork_); + } + + + void lapack_c_zsteqr3(char compz, + lapack_idx n, + double *d, + double *e, + lapack_double_complex *Z, + lapack_idx ldz, + lapack_double_complex *work, + lapack_idx lwork, + double *rwork, + lapack_idx lrwork, + lapack_idx *info) + { + CompQ compz_ = char2compq(compz); + + Vector d_(n, d); + Vector e_(n-1, e); + Matrix Z_(n, n, Z, ldz); + + if (lwork == -1) + { + lapack_idx lwork_ = steqr3_workquery(compz_, d_, e_, Z_); + *work = lwork_; + } + + if (lrwork == -1) + { + lapack_idx lrwork_ = steqr3_rworkquery(compz_, d_, e_, Z_); + *rwork = lrwork_; + } + + if( lwork == -1 || lrwork == -1) + return; + + MemoryBlock work_(lwork, work); + MemoryBlock rwork_(lrwork, rwork); + + *info = steqr3(compz_, d_, e_, Z_, work_, rwork_); + } + +} // extern "C" \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_fortran/csteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/csteqr3.f90 new file mode 100644 index 000000000..eb1115aa8 --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/csteqr3.f90 @@ -0,0 +1,31 @@ +subroutine csteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_csteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,& + lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_csteqr3") + use iso_c_binding, only: c_char, c_float, c_float_complex, c_int + implicit none + integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c + integer(c_int), intent(out) :: info_c + character(c_char), intent(in) :: compz_c + real(c_float), intent(inout) :: d_c(*), e_c(*), rwork_c(*) + complex(c_float_complex), intent(inout) :: z_c(ldz_c,*), work_c(*) + end subroutine lapack_c_csteqr3 + end interface + + ! Arguments + integer, intent(in) :: n, ldz, lwork, lrwork + integer, intent(out) :: info + character, intent(in) :: compz + real, intent(inout) :: d(*), e(*), rwork(*) + complex, intent(inout) :: z(ldz,*), work(*) + character(c_char) :: compz_to_c + + ! Convert characters to C characters + compz_to_c = compz + + ! Call C function + call lapack_c_csteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info) +end subroutine csteqr3 diff --git a/LAPACK_CPP/src/lapack_fortran/dsteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/dsteqr3.f90 new file mode 100644 index 000000000..d4aae1fc4 --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/dsteqr3.f90 @@ -0,0 +1,29 @@ +subroutine dsteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_dsteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,& + lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_dsteqr3") + use iso_c_binding, only: c_char, c_double, c_double_complex, c_int + implicit none + integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c + integer(c_int), intent(out) :: info_c + character(c_char), intent(in) :: compz_c + real(c_double), intent(inout) :: d_c(*), e_c(*), rwork_c(*), z_c(ldz_c,*), work_c(*) + end subroutine lapack_c_dsteqr3 + end interface + + ! Arguments + integer, intent(in) :: n, ldz, lwork, lrwork + integer, intent(out) :: info + character, intent(in) :: compz + double precision, intent(inout) :: d(*), e(*), rwork(*), z(ldz,*), work(*) + character(c_char) :: compz_to_c + + ! Convert characters to C characters + compz_to_c = compz + + ! Call C function + call lapack_c_dsteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info) +end subroutine dsteqr3 diff --git a/LAPACK_CPP/src/lapack_fortran/ssteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/ssteqr3.f90 new file mode 100644 index 000000000..e40e278d9 --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/ssteqr3.f90 @@ -0,0 +1,29 @@ +subroutine ssteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_ssteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,& + lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_ssteqr3") + use iso_c_binding, only: c_char, c_float, c_float_complex, c_int + implicit none + integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c + integer(c_int), intent(out) :: info_c + character(c_char), intent(in) :: compz_c + real(c_float), intent(inout) :: d_c(*), e_c(*), rwork_c(*), z_c(ldz_c,*), work_c(*) + end subroutine lapack_c_ssteqr3 + end interface + + ! Arguments + integer, intent(in) :: n, ldz, lwork, lrwork + integer, intent(out) :: info + character, intent(in) :: compz + real, intent(inout) :: d(*), e(*), rwork(*), z(ldz,*), work(*) + character(c_char) :: compz_to_c + + ! Convert characters to C characters + compz_to_c = compz + + ! Call C function + call lapack_c_ssteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info) +end subroutine ssteqr3 diff --git a/LAPACK_CPP/src/lapack_fortran/zsteqr3.f90 b/LAPACK_CPP/src/lapack_fortran/zsteqr3.f90 new file mode 100644 index 000000000..f12321579 --- /dev/null +++ b/LAPACK_CPP/src/lapack_fortran/zsteqr3.f90 @@ -0,0 +1,31 @@ +subroutine zsteqr3 ( compz, n, d, e, z, ldz, work, lwork, rwork, lrwork, info ) + use iso_c_binding, only: c_char + implicit none + ! Interface to C + interface + subroutine lapack_c_zsteqr3(compz_c, n_c, d_c, e_c, z_c, ldz_c, work_c,& + lwork_c, rwork_c, lrwork_c, info_c) bind(c, name="lapack_c_zsteqr3") + use iso_c_binding, only: c_char, c_double, c_double_complex, c_int + implicit none + integer(c_int), intent(in) :: n_c, ldz_c, lwork_c, lrwork_c + integer(c_int), intent(out) :: info_c + character(c_char), intent(in) :: compz_c + real(c_double), intent(inout) :: d_c(*), e_c(*), rwork_c(*) + complex(c_double_complex), intent(inout) :: z_c(ldz_c,*), work_c(*) + end subroutine lapack_c_zsteqr3 + end interface + + ! Arguments + integer, intent(in) :: n, ldz, lwork, lrwork + integer, intent(out) :: info + character, intent(in) :: compz + double precision, intent(inout) :: d(*), e(*), rwork(*) + double complex, intent(inout) :: z(ldz,*), work(*) + character(c_char) :: compz_to_c + + ! Convert characters to C characters + compz_to_c = compz + + ! Call C function + call lapack_c_zsteqr3(compz_to_c, n, d, e, z, ldz, work, lwork, rwork, lrwork, info) +end subroutine zsteqr3 From 7094cb0f7c8e379ab56517e2bf529a4f6e14e6a6 Mon Sep 17 00:00:00 2001 From: thijssteel Date: Wed, 6 Mar 2024 17:07:20 +0100 Subject: [PATCH 10/10] add steqr wrappers for comparison --- LAPACK_CPP/example/profile_steqr3.cpp | 50 ++++-- LAPACK_CPP/include/lapack_c.h | 1 + LAPACK_CPP/include/lapack_c/steqr.h | 51 ++++++ LAPACK_CPP/include/lapack_c/steqr3.h | 6 +- LAPACK_CPP/include/lapack_cpp/lapack.hpp | 1 + .../include/lapack_cpp/lapack/lasr3.hpp | 7 + .../include/lapack_cpp/lapack/steqr.hpp | 25 +++ .../include/lapack_cpp/lapack/steqr3.hpp | 2 +- LAPACK_CPP/src/lapack_c/steqr.f90 | 65 ++++++++ LAPACK_CPP/src/lapack_cpp/steqr.cpp | 151 ++++++++++++++++++ 10 files changed, 340 insertions(+), 19 deletions(-) create mode 100644 LAPACK_CPP/include/lapack_c/steqr.h create mode 100644 LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp create mode 100644 LAPACK_CPP/src/lapack_c/steqr.f90 create mode 100644 LAPACK_CPP/src/lapack_cpp/steqr.cpp diff --git a/LAPACK_CPP/example/profile_steqr3.cpp b/LAPACK_CPP/example/profile_steqr3.cpp index 517dd4b86..f6807c498 100644 --- a/LAPACK_CPP/example/profile_steqr3.cpp +++ b/LAPACK_CPP/example/profile_steqr3.cpp @@ -10,7 +10,7 @@ using namespace lapack_cpp; template -void profile_steqr3(idx_t n) +void profile_steqr3(idx_t n, bool use_fortran = false) { MemoryBlock Z_(n, n, layout); Matrix Z(n, n, Z_); @@ -35,22 +35,38 @@ void profile_steqr3(idx_t n) CompQ compz = CompQ::Initialize; - MemoryBlock work(steqr3_workquery(compz, d, e, Z)); - MemoryBlock, idx_t> rwork(steqr3_rworkquery(compz, d, e, Z)); - const idx_t n_timings = 100; const idx_t n_warmup = 50; std::vector timings(n_timings); - for (idx_t i = 0; i < n_timings; ++i) + if (use_fortran) + { + MemoryBlock, idx_t> rwork(2 * n - 2); + for (idx_t i = 0; i < n_timings; ++i) + { + d = d_copy; + e = e_copy; + auto start = std::chrono::high_resolution_clock::now(); + steqr(compz, d, e, Z, rwork); + auto end = std::chrono::high_resolution_clock::now(); + timings[i] = std::chrono::duration(end - start).count(); + std::cout << i << " " << timings[i] << '\r' << std::flush; + } + } + else { - d = d_copy; - e = e_copy; - auto start = std::chrono::high_resolution_clock::now(); - steqr3(compz, d, e, Z, work, rwork); - auto end = std::chrono::high_resolution_clock::now(); - timings[i] = std::chrono::duration(end - start).count(); - std::cout< work(steqr3_workquery(compz, d, e, Z)); + MemoryBlock, idx_t> rwork(steqr3_rworkquery(compz, d, e, Z)); + for (idx_t i = 0; i < n_timings; ++i) + { + d = d_copy; + e = e_copy; + auto start = std::chrono::high_resolution_clock::now(); + steqr3(compz, d, e, Z, work, rwork); + auto end = std::chrono::high_resolution_clock::now(); + timings[i] = std::chrono::duration(end - start).count(); + std::cout << i << " " << timings[i] << '\r' << std::flush; + } } float mean = 0; @@ -63,7 +79,7 @@ void profile_steqr3(idx_t n) std_dev += (timings[i] - mean) * (timings[i] - mean); std_dev = std::sqrt(std_dev / (n_timings - n_warmup - 1)); - std::cout << "n = " << n << ", mean time = " << mean + std::cout << "n = " << n << ", using fortran: " << use_fortran << ", mean time = " << mean << " s" << ", std dev = " << std_dev / mean * 100 << " %" << std::endl; @@ -74,9 +90,13 @@ int main() typedef lapack_idx_t idx_t; typedef double T; - for (int n = 32; n <= 4000; n *= 2) + for (int n = 32; n <= 2000; n *= 2) + { + profile_steqr3(n, false); + } + for (int n = 32; n <= 2000; n *= 2) { - profile_steqr3(n); + profile_steqr3(n, true); } return 0; } \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c.h b/LAPACK_CPP/include/lapack_c.h index 54d5fe637..f3fc0b7d4 100644 --- a/LAPACK_CPP/include/lapack_c.h +++ b/LAPACK_CPP/include/lapack_c.h @@ -12,6 +12,7 @@ #include "lapack_c/lae2.h" #include "lapack_c/laev2.h" #include "lapack_c/lasrt.h" +#include "lapack_c/steqr.h" #include "lapack_c/steqr3.h" #endif \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/steqr.h b/LAPACK_CPP/include/lapack_c/steqr.h new file mode 100644 index 000000000..5d8db0065 --- /dev/null +++ b/LAPACK_CPP/include/lapack_c/steqr.h @@ -0,0 +1,51 @@ +#ifndef LAPACK_C_STEQR_HPP +#define LAPACK_C_STEQR_HPP + +#include "lapack_c/util.h" + +#ifdef __cplusplus +extern "C" +{ +#endif // __cplusplus + + void lapack_c_ssteqr(char compz, + lapack_idx n, + float *d, + float *e, + float *Z, + lapack_idx ldz, + float *work, + lapack_idx *info); + + void lapack_c_dsteqr(char compz, + lapack_idx n, + double *d, + double *e, + double *Z, + lapack_idx ldz, + double *work, + lapack_idx *info); + + void lapack_c_csteqr(char compz, + lapack_idx n, + float *d, + float *e, + lapack_float_complex *Z, + lapack_idx ldz, + float *work, + lapack_idx *info); + + void lapack_c_zsteqr(char compz, + lapack_idx n, + double *d, + double *e, + lapack_double_complex *Z, + lapack_idx ldz, + double *work, + lapack_idx *info); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // LAPACK_C_STEQR_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_c/steqr3.h b/LAPACK_CPP/include/lapack_c/steqr3.h index 57741b9d9..b1ae85fd1 100644 --- a/LAPACK_CPP/include/lapack_c/steqr3.h +++ b/LAPACK_CPP/include/lapack_c/steqr3.h @@ -1,5 +1,5 @@ -#ifndef LAPACK_C_ROT_HPP -#define LAPACK_C_ROT_HPP +#ifndef LAPACK_C_STEQR3_HPP +#define LAPACK_C_STEQR3_HPP #include "lapack_c/util.h" @@ -60,4 +60,4 @@ extern "C" } #endif // __cplusplus -#endif \ No newline at end of file +#endif // LAPACK_C_STEQR3_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack.hpp b/LAPACK_CPP/include/lapack_cpp/lapack.hpp index bb515fbf2..29b84f589 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack.hpp @@ -6,6 +6,7 @@ #include "lapack_cpp/lapack/lartg.hpp" #include "lapack_cpp/lapack/lasr3.hpp" #include "lapack_cpp/lapack/lasrt.hpp" +#include "lapack_cpp/lapack/steqr.hpp" #include "lapack_cpp/lapack/steqr3.hpp" #endif // LAPACK_CPP_LAPACK_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp index a1ecfa66f..b91101b14 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack/lasr3.hpp @@ -127,6 +127,8 @@ namespace internal { TS s2) { assert(x1.size() == x2.size() && x2.size() == x3.size()); + if( c1 == real_t{1.} && s1 == TS{0.} && c2 == real_t{1.} && s2 == TS{0} ) + return; const idx_t n = x1.size(); for (idx_t i = 0; i < n; ++i) { T x2_g1 = -conj(s1) * x1[i] + c1 * x2[i]; @@ -160,6 +162,8 @@ namespace internal { TS s2) { assert(x1.size() == x2.size() && x2.size() == x3.size()); + if( c1 == real_t{1.} && s1 == TS{0.} && c2 == real_t{1.} && s2 == TS{0} ) + return; const idx_t n = x1.size(); for (idx_t i = 0; i < n; ++i) { T x2_g1 = c1 * x2[i] + s1 * x3[i]; @@ -208,6 +212,9 @@ namespace internal { { assert(x1.size() == x2.size() && x1.size() == x3.size() && x1.size() == x4.size()); + if( c1 == real_t{1.} && s1 == TS{0.} && c2 == real_t{1.} && s2 == TS{0} && + c3 == real_t{1.} && s3 == TS{0.} && c4 == real_t{1.} && s4 == TS{0} ) + return; const idx_t n = x1.size(); for (idx_t i = 0; i < n; ++i) { T x2_g1 = c1 * x2[i] + s1 * x3[i]; diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp new file mode 100644 index 000000000..dd8fa211f --- /dev/null +++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr.hpp @@ -0,0 +1,25 @@ +#ifndef LAPACK_CPP_STEQR_HPP +#define LAPACK_CPP_STEQR_HPP + +#include "lapack_cpp/base.hpp" + +namespace lapack_cpp +{ + + /** + * Computes all eigenvalues and, optionally, eigenvectors of a symmetric tridiagonal matrix using the QR algorithm. + */ + template + int steqr( + CompQ compz, + Vector, idx_t> d, + Vector, idx_t> e, + Matrix Z, + MemoryBlock, idx_t, aligned> &work); + +} + +#endif // LAPACK_CPP_STEQR_HPP \ No newline at end of file diff --git a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp index 11016f6ed..00e564f8e 100644 --- a/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp +++ b/LAPACK_CPP/include/lapack_cpp/lapack/steqr3.hpp @@ -498,4 +498,4 @@ namespace lapack_cpp } -#endif \ No newline at end of file +#endif // LAPACK_CPP_STEQR3_HPP \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_c/steqr.f90 b/LAPACK_CPP/src/lapack_c/steqr.f90 new file mode 100644 index 000000000..f1a8e0bf7 --- /dev/null +++ b/LAPACK_CPP/src/lapack_c/steqr.f90 @@ -0,0 +1,65 @@ +subroutine lapack_c_ssteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_ssteqr') + use iso_c_binding + implicit none + character(kind=c_char), intent(in), value :: compz + integer(kind=c_int), intent(in), value :: n, ldz + real(kind=c_float), intent(inout) :: d(*), e(*), Z(ldz,*) + real(kind=c_float), intent(out) :: work(*) + integer(kind=c_int), intent(out) :: info + + external ssteqr + + ! Forward call to fortran implementation + call ssteqr( compz, n, d, e, Z, ldz, work, info ) + +end subroutine lapack_c_ssteqr + +subroutine lapack_c_dsteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_dsteqr') + use iso_c_binding + implicit none + character(kind=c_char), intent(in), value :: compz + integer(kind=c_int), intent(in), value :: n, ldz + real(kind=c_double), intent(inout) :: d(*), e(*), Z(ldz,*) + real(kind=c_double), intent(out) :: work(*) + integer(kind=c_int), intent(out) :: info + + external dsteqr + + ! Forward call to fortran implementation + call dsteqr( compz, n, d, e, Z, ldz, work, info ) + +end subroutine lapack_c_dsteqr + +subroutine lapack_c_csteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_csteqr') + use iso_c_binding + implicit none + character(kind=c_char), intent(in), value :: compz + integer(kind=c_int), intent(in), value :: n, ldz + real(kind=c_float), intent(inout) :: d(*), e(*) + complex(kind=c_float_complex), intent(inout) :: Z(ldz,*) + real(kind=c_float), intent(out) :: work(*) + integer(kind=c_int), intent(out) :: info + + external csteqr + + ! Forward call to fortran implementation + call csteqr( compz, n, d, e, Z, ldz, work, info ) + +end subroutine lapack_c_csteqr + +subroutine lapack_c_zsteqr(compz, n, d, e, Z, ldz, work, info) bind(c, name='lapack_c_zsteqr') + use iso_c_binding + implicit none + character(kind=c_char), intent(in), value :: compz + integer(kind=c_int), intent(in), value :: n, ldz + real(kind=c_double), intent(inout) :: d(*), e(*) + complex(kind=c_double_complex), intent(inout) :: Z(ldz,*) + real(kind=c_double), intent(out) :: work(*) + integer(kind=c_int), intent(out) :: info + + external zsteqr + + ! Forward call to fortran implementation + call zsteqr( compz, n, d, e, Z, ldz, work, info ) + +end subroutine lapack_c_zsteqr \ No newline at end of file diff --git a/LAPACK_CPP/src/lapack_cpp/steqr.cpp b/LAPACK_CPP/src/lapack_cpp/steqr.cpp new file mode 100644 index 000000000..a484c7766 --- /dev/null +++ b/LAPACK_CPP/src/lapack_cpp/steqr.cpp @@ -0,0 +1,151 @@ +#include +#include +#include + +#include "lapack_c.h" +#include "lapack_cpp/base.hpp" +#include "lapack_cpp/lapack/steqr.hpp" + +namespace lapack_cpp +{ + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_ssteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_dsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix, Layout::ColMajor, int> Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_csteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix, Layout::ColMajor, int> Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_zsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_ssteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_dsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix, Layout::ColMajor, int64_t> Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_csteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + + template <> + int steqr(CompQ compz, Vector d, Vector e, Matrix, Layout::ColMajor, int64_t> Z, MemoryBlock &work) + { + assert(d.size() >= 0); + assert(d.size() == e.size() + 1); + assert(d.size() == Z.num_rows()); + assert(d.size() == Z.num_columns()); + assert(2*d.size()-2 == work.size()); + // fortran code only supports stride 1 + assert(d.stride() == 1); + assert(e.stride() == 1); + lapack_idx info; + lapack_c_zsteqr((char)compz, d.size(), d.ptr(), e.ptr(), Z.ptr(), Z.ldim(), work.ptr(), &info); + + return info; + } + +} // namespace lapack_cpp \ No newline at end of file