diff --git a/src/linearAlgebra/AbstractMatrix.h b/src/linearAlgebra/AbstractMatrix.h index 2d8ccbecc..71e7d4739 100644 --- a/src/linearAlgebra/AbstractMatrix.h +++ b/src/linearAlgebra/AbstractMatrix.h @@ -40,6 +40,93 @@ namespace dftefe template class AbstractMatrix { + public: + /** + * @brief The value setter interface for the matrix class. This setter + * assumes the given pointer to ValueType to be a serially repeated + * matrix and copy the data to the corresponding local owned + * location trivially. + * @warning There is no boundary check in this function. The user should + * be responsible that the size of data array should have d_m*d_n + * size. + * @param data The pointer to the data to be copied from. + */ + virtual void + setValues(const ValueType *data) = 0; + + /** + * @brief The value setter interface for the matrix class. This setter + * assumes the given pointer to ValueType to be a serially repeated + * matrix and copy the data to the corresponding local owned + * sub-matrix (i1:i2-1, j1:j2-1) trivially. + * @warning There is no boundary check in this function. The user should + * be responsible that the size of data array should have + * (j2-j1)*(i2-i1) size. + * @param i1 the global index of the first row of the submatrix. + * @param i2 one passes the global index of the last row of the submatrix. + * @param j1 the global index of the first column of the submatrix. + * @param j2 one passes the global index of the last column of the + * submatrix. + * @param data The pointer to the data to be copied from. The user should + * be responsible that the size of data array should have + * (j2-j1)*(i2-i1) size. + */ + virtual void + setValues(size_t i1, + size_t i2, + size_t j1, + size_t j2, + const ValueType *data) = 0; + + // /** + // * @brief The value setter interface for the matrix class. This setter + // * allows data to be different on each processor. The + // locally owned + // * data contains two parts: (1) values belongs to the + // locally owned + // * portion of the matrix and (2) values belong to the + // off-processor + // * portion of the matrix. This setter will distribute the + // * off-processor values correspondingly. + // * @warning This routine currently is not optimized and fully + // * tested. Using this routing to assign values can + // * deteriorate the performance dramatically. This should + // * only be used for debugging purpose or not performance + // * relevant part of the code. + // * @param i1 the global index of the first row of the submatrix. + // * @param i2 one passes the global index of the last row of the submatrix. + // * @param j1 the global index of the first column of the submatrix. + // * @param j2 one passes the global index of the last column of the + // * submatrix. + // * @param data The pointer to the data to be copied from. The user should + // * be responsible that the size of data array should + // have + // * (j2-j1)*(i2-i1) size. + // */ + // virtual void + // setDistributedValues(size_t i1, + // size_t i2, + // size_t j1, + // size_t j2, + // const ValueType *data) = 0; + // + // /** + // * @brief The value setter interface for inserting a single value. + // * @warning This routine is sub-optimal and can seriously deteriorate the + // * the performance. It should be only used when + // necessary. Only + // * the processor which owns (i, j) element will be + // inserting + // * value. + // * @param i the row index of the value. + // * @param j the column index of the value. + // * @param d the value. + // */ + // virtual void + // setValue(size_t i, size_t j, ValueType d) = 0; + + AbstractMatrix() = delete; + protected: AbstractMatrix(size_t m, size_t n, @@ -51,7 +138,24 @@ namespace dftefe size_t d_m, d_n; size_t d_mb, d_nb, d_p, d_q; MPI_Comm d_comm; + int d_num_ranks; slate::BaseMatrix *d_baseMatrix = nullptr; + + void + setValueSlateMatrix(slate::BaseMatrix *matrix, + const ValueType * data); + + void + setValueSlateMatrix(slate::BaseMatrix *matrix, + const ValueType * data, + int i1, + int i2, + int j1, + int j2); + + + void + initSlateMatrix(slate::BaseMatrix *matrix, ValueType val = 0); }; } // namespace linearAlgebra diff --git a/src/linearAlgebra/AbstractMatrix.t.cpp b/src/linearAlgebra/AbstractMatrix.t.cpp index 25a39b375..30223453c 100644 --- a/src/linearAlgebra/AbstractMatrix.t.cpp +++ b/src/linearAlgebra/AbstractMatrix.t.cpp @@ -23,6 +23,8 @@ * @author Ian C. Lin. */ +#include "AbstractMatrix.h" + namespace dftefe { namespace linearAlgebra @@ -44,6 +46,95 @@ namespace dftefe , d_mb(mb) {} + template + void + AbstractMatrix::setValues(const ValueType *data) + { + setValueSlateMatrix(d_baseMatrix, data); + } + + template + void + AbstractMatrix::initSlateMatrix( + slate::BaseMatrix *matrix, + ValueType val) + { + for (int64_t j = 0, j_offset = 0; j < matrix->nt(); + j_offset += matrix->tileNb(j++)) + { + for (int64_t i = 0, i_offset = 0; i < matrix->mt(); + i_offset += matrix->tileMb(i++)) + { + if (matrix->tileIsLocal(i, j)) + { + matrix->at(i, j).set(val); + } + } + } + } + template + void + AbstractMatrix::setValueSlateMatrix( + slate::BaseMatrix *matrix, + const ValueType * data) + { + for (int64_t j = 0, j_offset = 0; j < matrix->nt(); + j_offset += matrix->tileNb(j++)) + { + for (int64_t i = 0, i_offset = 0; i < matrix->mt(); + i_offset += matrix->tileMb(i++)) + { + if (matrix->tileIsLocal(i, j)) + { + slate::Tile T = (*matrix)(i, j); + // todo: check for transpose case (d_m and d_n) + int64_t mb = T.mb(), nb = T.nb(), + offset = i_offset + j_offset * d_m; + lapack::lacpy(lapack::MatrixType::General, + mb, + nb, + &data[offset], + d_m, + T.data(), + mb); + } + } + } + } + + template + void + AbstractMatrix::setValueSlateMatrix( + slate::BaseMatrix *matrix, + const ValueType * data, + int i1, + int i2, + int j1, + int j2) + { + for (int64_t j = 0, j_offset = 0; j < matrix->nt(); + j_offset += matrix->tileNb(j++)) + { + for (int64_t i = 0, i_offset = 0; i < matrix->mt(); + i_offset += matrix->tileMb(i++)) + { + if (matrix->tileIsLocal(i, j)) + { + slate::Tile T = (*matrix)(i, j); + // todo: check for transpose case (d_m and d_n) + int64_t mb = T.mb(), nb = T.nb(), + offset = i_offset + j_offset * d_m; + int64_t ii1 = lapack::lacpy(lapack::MatrixType::General, + mb, + nb, + &data[offset], + d_m, + T.data(), + mb); + } + } + } + } } // namespace linearAlgebra } // namespace dftefe diff --git a/src/linearAlgebra/CMakeLists.txt b/src/linearAlgebra/CMakeLists.txt index 0b19d7f6f..5eb545681 100644 --- a/src/linearAlgebra/CMakeLists.txt +++ b/src/linearAlgebra/CMakeLists.txt @@ -13,20 +13,29 @@ if (NOT TARGET dft-efe-linalg) QueueManager.cpp LinAlgOpContext.cpp VectorAttributes.cpp - Vector.cpp - MultiVector.cpp + Vector.cpp + MultiVector.cpp AbstractMatrix.cpp GeneralMatrix.cpp HermitianMatrix.cpp TriangularMatrix.cpp MatrixKernels.cu MatrixKernels.cpp - MatrixOperations.cpp) + MatrixOperations.cpp + MatVecOperations.cpp) add_library(dft-efe-linalg SHARED ${DFT-EFE-LINALG-SOURCES}) include_directories(../) + find_package(deal.II 9.3.0 REQUIRED HINTS ${DEALII_PATH}) + target_include_directories(dft-efe-linalg PUBLIC ${DEAL_II_INCLUDE_DIRS}) + IF("${CMAKE_BUILD_TYPE}" STREQUAL "Release") + target_link_libraries (dft-efe-linalg PUBLIC ${DEAL_II_LIBRARIES_RELEASE}) + ELSE() + target_link_libraries (dft-efe-linalg PUBLIC ${DEAL_II_LIBRARIES_DEBUG}) + ENDIF() + if (ENABLE_MPI) add_compile_definitions(DFTEFE_WITH_MPI) if (NOT MPI_FOUND) @@ -55,5 +64,5 @@ if (NOT TARGET dft-efe-linalg) target_include_directories(dft-efe-linalg PUBLIC ${SLATE_DIR}/include) target_link_directories(dft-efe-linalg PUBLIC ${SLATE_DIR}/lib64) - target_link_libraries(dft-efe-linalg PUBLIC blaspp slate dft-efe-utils ${DFTEFE_LINEARALGEBRA_MPI_LIBRARIES}) + target_link_libraries(dft-efe-linalg PUBLIC blaspp lapackpp slate gomp dft-efe-utils ${DFTEFE_LINEARALGEBRA_MPI_LIBRARIES}) endif () diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/README b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/README new file mode 100644 index 000000000..ac6d23e1c --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/README @@ -0,0 +1,4 @@ +This directory is adapted from Scalapack wrapper in DFT-FE and Deal.ii code. It contains the scalapack wrapper when we +really need a fall back option if SLATE does not work. + +This project currently does NOT support Scalapck. \ No newline at end of file diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/lapack_support.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/lapack_support.h new file mode 100644 index 000000000..614671841 --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/lapack_support.h @@ -0,0 +1,233 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2005 - 2018 by the deal.II authors +// +// This file is part of the deal.II library. +// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. +// +// --------------------------------------------------------------------- + +#ifndef dftfe_lapack_support_h +#define dftfe_lapack_support_h + + +#include +namespace dftefeScalapack +{ + namespace types + { +#ifdef LAPACK_WITH_64BIT_BLAS_INDICES + /** + * Integer type in BLAS. + */ + using blas_int = long long; +#else + /** + * Integer type in BLAS. + */ + using blas_int = int; +#endif + } // namespace types + + /** + * A namespace containing constants, exceptions, enumerations, and other + * utilities used by the deal.II LAPACK bindings. + */ + namespace LAPACKSupport + { + /** + * Most of the LAPACK functions one can apply to a matrix (e.g., by calling + * the member functions of this class) change its content in some ways. For + * example, they may invert the matrix, or may replace it by a matrix whose + * columns represent the eigenvectors of the original content of the matrix. + * The elements of this enumeration are therefore used to track what is + * currently being stored by this object. + */ + enum State + { + /// Contents is actually a matrix. + matrix, + /// Contents is the inverse of a matrix. + inverse_matrix, + /// Contents is an LU decomposition. + lu, + /// Contents is a Cholesky decomposition. + cholesky, + /// Eigenvalue vector is filled + eigenvalues, + /// Matrix contains singular value decomposition, + svd, + /// Matrix is the inverse of a singular value decomposition + inverse_svd, + /// Contents is something useless. + unusable = 0x8000 + }; + + /** + * %Function printing the name of a State. + */ + inline const char * + state_name(State s) + { + switch (s) + { + case matrix: + return "matrix"; + case inverse_matrix: + return "inverse matrix"; + case lu: + return "lu decomposition"; + case cholesky: + return "cholesky decomposition"; + case eigenvalues: + return "eigenvalues"; + case svd: + return "svd"; + case inverse_svd: + return "inverse_svd"; + case unusable: + return "unusable"; + default: + return "unknown"; + } + } + + /** + * A matrix can have certain features allowing for optimization, but hard to + * test. These are listed here. + */ + enum Property + { + /// No special properties + general = 0, + /// Matrix is symmetric + hermitian = 1, + /// Matrix is upper triangular + upper_triangular = 2, + /// Matrix is lower triangular + lower_triangular = 4, + /// Matrix is diagonal + diagonal = 6, + /// Matrix is in upper Hessenberg form + hessenberg = 8 + }; + + /** + * %Function printing the name of a Property. + */ + inline const char * + property_name(const Property s) + { + switch (s) + { + case general: + return "general"; + case hermitian: + return "hermitian"; + case upper_triangular: + return "upper triangular"; + case lower_triangular: + return "lower triangular"; + case diagonal: + return "diagonal"; + case hessenberg: + return "Hessenberg"; + } + + Assert(false, dealii::ExcNotImplemented()); + return "invalid"; + } + + /** + * Character constant. + */ + static const char A = 'A'; + /** + * Character constant. + */ + static const char N = 'N'; + /** + * Character constant. + */ + static const char O = 'O'; + /** + * Character constant. + */ + static const char T = 'T'; + /** + * Character constant for conjugate transpose. + */ + static const char C = 'C'; + /** + * Character constant. + */ + static const char U = 'U'; + /** + * Character constant. + */ + static const char L = 'L'; + /** + * Character constant. + */ + static const char V = 'V'; + /** + * Integer constant. + */ + static const types::blas_int zero = 0; + /** + * Integer constant. + */ + static const types::blas_int one = 1; + + /** + * A LAPACK function returned an error code. + */ + DeclException2(ExcErrorCode, + std::string, + types::blas_int, + << "The function " << arg1 << " returned with an error code " + << arg2); + + /** + * Exception thrown when a matrix is not in a suitable state for an + * operation. For instance, a LAPACK routine may have left the matrix in an + * unusable state, then vmult does not make sense anymore. + */ + DeclException1( + ExcState, + State, + << "The function cannot be called while the matrix is in state " + << state_name(arg1)); + + /** + * Exception thrown when a matrix does not have suitable properties for an + * operation. + */ + DeclException1(ExcProperty, + Property, + << "The function cannot be called with a " + << property_name(arg1) << " matrix."); + + /** + * This exception is thrown if a certain LAPACK function is not available + * because no LAPACK installation was detected during configuration. + */ + DeclException1( + ExcMissing, + std::string, + << "When you ran 'cmake' during installation of deal.II, " + << "no suitable installation of the BLAS or LAPACK library could " + << "be found. Consequently, the function <" << arg1 + << "> can not be called. Refer to the doc/readme.html " + << "file for information on how to ensure that deal.II " + << "picks up an existing BLAS and LAPACK installation at " + << "configuration time."); + } // namespace LAPACKSupport +} // namespace dftefeScalapack +#endif diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.cpp b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.cpp new file mode 100644 index 000000000..0430a6045 --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.cpp @@ -0,0 +1,273 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2017 - 2019 by the deal.II authors +// +// This file is part of the deal.II library. +// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. +// +// --------------------------------------------------------------------- + +#include "process_grid.h" +#include "scalapack.templates.h" +#include + +namespace dftefeScalapack +{ + namespace + { + /** + * Internal function to determine dimension of process grid based on the + * total number of cores, matrix dimensions and matrix block sizes + * + * Amesos heuristics: + * https://github.com/trilinos/Trilinos/blob/master/packages/amesos/src/Amesos_Scalapack.cpp#L142-L166 + * + * Elemental default grid: El::Grid::Grid(mpi_comm,...) + * https://github.com/elemental/Elemental/blob/master/src/core/Grid.cpp#L67-L91 + */ + inline std::pair + compute_processor_grid_sizes(const MPI_Comm & mpi_comm, + const unsigned int m, + const unsigned int n, + const unsigned int block_size_m, + const unsigned int block_size_n) + { + // Few notes from the ScaLAPACK user guide: + // It is possible to predict the best grid shape given the number of + // processes available: Pr x Pc <= P This, however, depends on the task to + // be done. LU , QR and QL factorizations perform better for “flat” + // process grids (Pr < Pc ) For large N, Pc = 2*Pr is a good choice, + // whereas for small N, one should choose small Pr Square or near square + // grids are more optimal for Cholesky factorization. LQ and RQ + // factorizations take advantage of “tall” grids (Pr > Pc ) + + // Below we always try to create 2D processor grids: + + const int n_processes = dealii::Utilities::MPI::n_mpi_processes(mpi_comm); + + // Get the total number of cores we can occupy in a rectangular dense + // matrix with rectangular blocks when every core owns only a single + // block: + const int n_processes_heuristic = + int(std::ceil((1. * m) / block_size_m)) * + int(std::ceil((1. * n) / block_size_n)); + const int Np = std::min(n_processes_heuristic, n_processes); + + // Now we need to split Np into Pr x Pc. Assume we know the shape/ratio + // Pc =: ratio * Pr + // therefore + // Np = Pc * Pc / ratio + // for quadratic matrices the ratio equals 1 + const double ratio = double(n) / m; + int Pc = static_cast(std::sqrt(ratio * Np)); + + // one could rounds up Pc to the number which has zero remainder from the + // division of Np while ( Np % Pc != 0 ) + // ++Pc; + // but this affects the grid shape dramatically, i.e. 10 cores 3x3 becomes + // 2x5. + + // limit our estimate to be in [2, Np] + int n_process_columns = std::min(Np, std::max(2, Pc)); + // finally, get the rows: + int n_process_rows = Np / n_process_columns; + + Assert(n_process_columns >= 1 && n_process_rows >= 1 && + n_processes >= n_process_rows * n_process_columns, + dealii::ExcMessage( + "error in process grid: " + std::to_string(n_process_rows) + + "x" + std::to_string(n_process_columns) + "=" + + std::to_string(n_process_rows * n_process_columns) + " out of " + + std::to_string(n_processes))); + + return std::make_pair(n_process_rows, n_process_columns); + + // For example, + // 320x320 with 32x32 blocks and 16 cores: + // Pc = 1.0 * Pr => 4x4 grid + // Pc = 0.5 * Pr => 8x2 grid + // Pc = 2.0 * Pr => 3x5 grid + } + } // namespace + + + ProcessGrid::ProcessGrid( + const MPI_Comm & mpi_comm, + const std::pair &grid_dimensions) + : mpi_communicator(mpi_comm) + , this_mpi_process( + dealii::Utilities::MPI::this_mpi_process(mpi_communicator)) + , n_mpi_processes(dealii::Utilities::MPI::n_mpi_processes(mpi_communicator)) + , n_process_rows(grid_dimensions.first) + , n_process_columns(grid_dimensions.second) + { + Assert(grid_dimensions.first > 0, + dealii::ExcMessage( + "Number of process grid rows has to be positive.")); + Assert(grid_dimensions.second > 0, + dealii::ExcMessage( + "Number of process grid columns has to be positive.")); + + Assert( + grid_dimensions.first * grid_dimensions.second <= n_mpi_processes, + dealii::ExcMessage( + "Size of process grid is larger than number of available MPI processes.")); + + // processor grid order. + const bool column_major = false; + + // Initialize Cblas context from the provided communicator + blacs_context = Csys2blacs_handle(mpi_communicator); + const char *order = (column_major ? "Col" : "Row"); + // Note that blacs_context can be modified below. Thus Cblacs2sys_handle + // may not return the same MPI communicator. + Cblacs_gridinit(&blacs_context, order, n_process_rows, n_process_columns); + + // Blacs may modify the grid size on processes which are not used + // in the grid. So provide copies below: + int procrows_ = n_process_rows; + int proccols_ = n_process_columns; + Cblacs_gridinfo(blacs_context, + &procrows_, + &proccols_, + &this_process_row, + &this_process_column); + + // If this MPI core is not on the grid, flag it as inactive and + // skip all jobs + // Note that a different condition is used in FORTRAN code here + // https://stackoverflow.com/questions/18516915/calling-blacs-with-more-processes-than-used + if (this_process_row < 0 || this_process_column < 0) + mpi_process_is_active = false; + else + mpi_process_is_active = true; + + // Create an auxiliary communicator which has root and all inactive cores. + // Assume that inactive cores start with + // id=n_process_rows*n_process_columns + const unsigned int n_active_mpi_processes = + n_process_rows * n_process_columns; + Assert(mpi_process_is_active || this_mpi_process >= n_active_mpi_processes, + dealii::ExcInternalError()); + + std::vector inactive_with_root_ranks; + inactive_with_root_ranks.push_back(0); + for (unsigned int i = n_active_mpi_processes; i < n_mpi_processes; ++i) + inactive_with_root_ranks.push_back(i); + + // Get the group of processes in mpi_communicator + int ierr = 0; + MPI_Group all_group; + ierr = MPI_Comm_group(mpi_communicator, &all_group); + AssertThrowMPI(ierr); + + // Construct the group containing all ranks we need: + MPI_Group inactive_with_root_group; + const int n = inactive_with_root_ranks.size(); + ierr = MPI_Group_incl(all_group, + n, + inactive_with_root_ranks.data(), + &inactive_with_root_group); + AssertThrowMPI(ierr); + + // Create the communicator based on inactive_with_root_group. + // Note that on all the active MPI processes (except for the one with + // rank 0) the resulting MPI_Comm mpi_communicator_inactive_with_root + // will be MPI_COMM_NULL. + const int mpi_tag = + dealii::Utilities::MPI::internal::Tags::process_grid_constructor; + + ierr = dealii::Utilities::MPI::create_group( + mpi_communicator, + inactive_with_root_group, + mpi_tag, + &mpi_communicator_inactive_with_root); + AssertThrowMPI(ierr); + + ierr = MPI_Group_free(&all_group); + AssertThrowMPI(ierr); + ierr = MPI_Group_free(&inactive_with_root_group); + AssertThrowMPI(ierr); + + // Double check that the process with rank 0 in subgroup is active: +#ifdef DEBUG + if (mpi_communicator_inactive_with_root != MPI_COMM_NULL && + dealii::Utilities::MPI::this_mpi_process( + mpi_communicator_inactive_with_root) == 0) + Assert(mpi_process_is_active, dealii::ExcInternalError()); +#endif + } + + + + ProcessGrid::ProcessGrid(const MPI_Comm & mpi_comm, + const unsigned int n_rows_matrix, + const unsigned int n_columns_matrix, + const unsigned int row_block_size, + const unsigned int column_block_size) + : ProcessGrid(mpi_comm, + compute_processor_grid_sizes(mpi_comm, + n_rows_matrix, + n_columns_matrix, + row_block_size, + column_block_size)) + {} + + + + ProcessGrid::ProcessGrid(const MPI_Comm & mpi_comm, + const unsigned int n_rows, + const unsigned int n_columns) + : ProcessGrid(mpi_comm, std::make_pair(n_rows, n_columns)) + {} + + + + ProcessGrid::~ProcessGrid() + { + if (mpi_process_is_active) + Cblacs_gridexit(blacs_context); + + if (mpi_communicator_inactive_with_root != MPI_COMM_NULL) + MPI_Comm_free(&mpi_communicator_inactive_with_root); + } + + + + template + void + ProcessGrid::send_to_inactive(NumberType *value, const int count) const + { + Assert(count > 0, dealii::ExcInternalError()); + if (mpi_communicator_inactive_with_root != MPI_COMM_NULL) + { + const int ierr = + MPI_Bcast(value, + count, + dealii::Utilities::MPI::internal::mpi_type_id(value), + 0 /*from root*/, + mpi_communicator_inactive_with_root); + AssertThrowMPI(ierr); + } + } +} // namespace dftefeScalapack + +// instantiations + +template void +dftefeScalapack::ProcessGrid::send_to_inactive(double *, + const int) const; +template void +dftefeScalapack::ProcessGrid::send_to_inactive(float *, const int) const; +template void +dftefeScalapack::ProcessGrid::send_to_inactive(int *, const int) const; +template void +dftefeScalapack::ProcessGrid::send_to_inactive>( + std::complex *, + const int) const; diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.h new file mode 100644 index 000000000..ca68bdaca --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.h @@ -0,0 +1,253 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2017 - 2019 by the deal.II authors +// +// This file is part of the deal.II library. +// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. +// +// --------------------------------------------------------------------- + +#ifndef dftfe_process_grid_h +#define dftfe_process_grid_h + +#include +#include + + +namespace dftefeScalapack +{ +// Forward declaration of class ScaLAPACKMatrix for ProcessGrid +#ifndef DOXYGEN + template + class ScaLAPACKMatrix; +#endif + + /** + * A class taking care of setting up a two-dimensional processor grid. + * For example an MPI communicator with 5 processes can be arranged into a + * 2x2 grid with the 5-th processor being inactive: + * @code + * | 0 | 1 + * -----| ------- |----- + * 0 | P0 | P1 + * | | + * -----| ------- |----- + * 1 | P2 | P3 + * @endcode + * + * A shared pointer to this class is provided to ScaLAPACKMatrix matrices to + * perform block-cyclic distribution. + * + * Note that this class allows to setup a process grid which has fewer + * MPI cores than the total number of cores in the communicator. + * + * Currently the only place where one would use a ProcessGrid object is + * in connection with a ScaLAPACKMatrix object. + */ + class ProcessGrid + { + public: + // Declare class ScaLAPACK as friend to provide access to private members. + template + friend class dftefeScalapack::ScaLAPACKMatrix; + + /** + * Constructor for a process grid with @p n_rows and @p n_columns for a given @p mpi_communicator. + * The product of rows and columns should be less or equal to the total + * number of cores + * in the @p mpi_communicator. + */ + ProcessGrid(const MPI_Comm & mpi_communicator, + const unsigned int n_rows, + const unsigned int n_columns); + + /** + * Constructor for a process grid for a given @p mpi_communicator. + * In this case the process grid is heuristically chosen based on the + * dimensions and block-cyclic distribution of a target matrix provided + * in @p n_rows_matrix, @p n_columns_matrix, @p row_block_size and @p column_block_size. + * + * The maximum number of MPI cores one can utilize is + * $\min\{\frac{M}{MB}\frac{N}{NB}, Np\}$, where $M,N$ are the matrix + * dimension and $MB,NB$ are the block sizes and $Np$ is the number of + * processes in the @p mpi_communicator. This function then creates a 2D processor grid + * assuming the ratio between number of process row $p$ and columns $q$ to + * be equal the ratio between matrix dimensions $M$ and $N$. + * + * For example, a square matrix $640x640$ with the block size $32$ + * and the @p mpi_communicator with 11 cores will result in the $3x3$ + * process grid. + */ + ProcessGrid(const MPI_Comm & mpi_communicator, + const unsigned int n_rows_matrix, + const unsigned int n_columns_matrix, + const unsigned int row_block_size, + const unsigned int column_block_size); + + /** + * Destructor. + */ + ~ProcessGrid(); + + /** + * Return the number of rows in the processes grid. + */ + unsigned int + get_process_grid_rows() const; + + /** + * Return the number of columns in the processes grid. + */ + unsigned int + get_process_grid_columns() const; + + /** + * Return row of this process in the process grid. + * + * It's negative for inactive processes. + */ + int + get_this_process_row() const; + + /** + * Return column of this process in the process grid. + * + * It's negative for inactive processes. + */ + int + get_this_process_column() const; + + /** + * Send @p count values stored consequently starting at @p value from + * the process with rank zero to processes which + * are not in the process grid. + */ + template + void + send_to_inactive(NumberType *value, const int count = 1) const; + + /** + * Return true if the process is active within the grid. + */ + bool + is_process_active() const; + + private: + /** + * A private constructor which takes grid dimensions as an + * std::pair. + */ + ProcessGrid(const MPI_Comm & mpi_communicator, + const std::pair &grid_dimensions); + + /** + * An MPI communicator with all processes (active and inactive). + */ + MPI_Comm mpi_communicator; + + /** + * An MPI communicator with inactive processes and the process with rank + * zero. + */ + MPI_Comm mpi_communicator_inactive_with_root; + + /** + * BLACS context. This is equivalent to MPI communicators and is + * used by ScaLAPACK. + */ + int blacs_context; + + /** + * Rank of this MPI process. + */ + const unsigned int this_mpi_process; + + /** + * Total number of MPI processes. + */ + const unsigned int n_mpi_processes; + + /** + * Number of rows in the processes grid. + */ + int n_process_rows; + + /** + * Number of columns in the processes grid. + */ + int n_process_columns; + + /** + * Row of this process in the grid. + * + * It's negative for in-active processes. + */ + int this_process_row; + + /** + * Column of this process in the grid. + * + * It's negative for in-active processes. + */ + int this_process_column; + + /** + * A flag which is true for processes within the 2D process grid. + */ + bool mpi_process_is_active; + }; + + /*--------------------- Inline functions --------------------------------*/ + +#ifndef DOXYGEN + + inline unsigned int + ProcessGrid::get_process_grid_rows() const + { + return n_process_rows; + } + + + + inline unsigned int + ProcessGrid::get_process_grid_columns() const + { + return n_process_columns; + } + + + + inline int + ProcessGrid::get_this_process_row() const + { + return this_process_row; + } + + + + inline int + ProcessGrid::get_this_process_column() const + { + return this_process_column; + } + + + + inline bool + ProcessGrid::is_process_active() const + { + return mpi_process_is_active; + } + + +#endif // ifndef DOXYGEN + +} // namespace dftefeScalapack + +#endif diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapack.templates.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapack.templates.h new file mode 100644 index 000000000..1c0fec41f --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapack.templates.h @@ -0,0 +1,2803 @@ +// --------------------------------------------------------------------- +// +// Copyright (C) 2017 - 2019 by the deal.II authors +// +// This file is part of the deal.II library. +// +// The deal.II library is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE.md at +// the top level directory of deal.II. +// +// --------------------------------------------------------------------- + +#ifndef dftfe_scalapack_templates_h +#define dftfe_scalapack_templates_h + + +#include +#include +#include +#include + +#ifdef DEAL_II_HAVE_FP_EXCEPTIONS +# include +#endif + +// useful examples: +// https://stackoverflow.com/questions/14147705/cholesky-decomposition-scalapack-error/14203864 +// http://icl.cs.utk.edu/lapack-forum/viewtopic.php?t=139 // second post by +// Julien Langou +// https://andyspiros.wordpress.com/2011/07/08/an-example-of-blacs-with-c/ +// http://qboxcode.org/trac/browser/qb/tags/rel1_63_4/src/Matrix.C +// https://gitlab.phys.ethz.ch/lwossnig/lecture/blob/a534f562dfb2ad5c564abe5c2356d5d956fb7218/examples/mpi/scalapack.cpp +// https://github.com/elemental/Elemental/blob/master/src/core/imports/scalapack.cpp +// https://scicomp.stackexchange.com/questions/7766/performance-optimization-or-tuning-possible-for-scalapack-gemm +// +// info: +// http://www.netlib.org/scalapack/slug/index.html // User guide +// http://www.netlib.org/scalapack/slug/node135.html // How to Measure Errors +// NOTE: Non-templated functions are chosen over templated functions if their +// names match and template function is not explicitly called +namespace dftefeScalapack +{ + extern "C" + { + /* Basic Linear Algebra Communication Subprograms (BLACS) declarations */ + // https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dinitb.htm#dinitb + + /** + * Determine how many processes are available and the current process rank. + * + * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbpnf.htm + */ + void + Cblacs_pinfo(int *rank, int *nprocs); + + /** + * Return internal BLACS value in @p val based on the input @p what and @p icontxt. + * The most common use is in retrieving a default system context (@p what = 0, @p icontxt is ignored) + * to be used in BLACS_GRIDINIT or BLACS_GRIDMAP. + * + * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbget.htm + */ + void + Cblacs_get(int icontxt, int what, int *val); + + /** + * Map the processes sequentially in row-major or column-major order + * into the process grid. Input arguments must be the same on every process. + * + * On return, @p context is the integer handle to the BLACS context, + * whereas on entry it is a system context to be used in creating the + * BLACS context. + * + * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbint.htm + */ + void + Cblacs_gridinit(int * context, + const char *order, + int grid_height, + int grid_width); + + /** + * Return the process row and column index. + * + * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbinfo.htm + */ + void + Cblacs_gridinfo(int context, + int *grid_height, + int *grid_width, + int *grid_row, + int *grid_col); + + /** + * Given the system process number, return the row and column coordinates in + * the BLACS' process grid. + */ + void + Cblacs_pcoord(int ictxt, int pnum, int *prow, int *pcol); + + /** + * Release a BLACS context. + */ + void + Cblacs_gridexit(int context); + + /** + * This routines holds up execution of all processes within the indicated + * scope until they have all called the routine. + */ + void + Cblacs_barrier(int, const char *); + + /** + * Free all BLACS contexts and releases all allocated memory. + */ + void + Cblacs_exit(int error_code); + + /** + * Receives a message from a process @prsrc, @p csrc into a general rectangular matrix. + * + * https://software.intel.com/en-us/mkl-developer-reference-c-gerv2d + */ + void + Cdgerv2d(int context, int M, int N, double *A, int lda, int rsrc, int csrc); + void + Csgerv2d(int context, int M, int N, float *A, int lda, int rsrc, int csrc); + + /** + * Sends the general rectangular matrix A to the destination + * process @p rdest @p cdest in the process grid. + * + * https://software.intel.com/en-us/mkl-developer-reference-c-2018-beta-gesd2d + */ + void + Cdgesd2d(int context, + int M, + int N, + double *A, + int lda, + int rdest, + int cdest); + void + Csgesd2d(int context, + int M, + int N, + float *A, + int lda, + int rdest, + int cdest); + + /** + * Get BLACS context from MPI @p comm. + */ + int + Csys2blacs_handle(MPI_Comm comm); + + /** + * Compute how many rows and columns each process owns (NUMber of Rows Or + * Columns). + * + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dnumy.htm + */ + int + numroc_(const int *n, + const int *nb, + const int *iproc, + const int *isproc, + const int *nprocs); + /** + * Compute complex conjugate + */ + void + pzlacgv_(const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const int * INCX); + + /** + * Compute the Cholesky factorization of an N-by-N real + * symmetric positive definite distributed matrix sub( A ) denoting + * A(IA:IA+N-1, JA:JA+N-1). + * + * http://www.netlib.org/scalapack/explore-html/d5/d9e/pdpotrf_8f_source.html + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpotrf.htm + */ + void + pdpotrf_(const char *UPLO, + const int * N, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + void + pspotrf_(const char *UPLO, + const int * N, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + void + pzpotrf_(const char * UPLO, + const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + + /** + * Computes an LU factorization of a general distributed matrix sub( A ) + * using partial pivoting with row interchanges. + * + * http://www.netlib.org/scalapack/explore-html/df/dfe/pdgetrf_8f_source.html + * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgetrf.htm + */ + void + pdgetrf_(const int *m, + const int *n, + double * A, + const int *IA, + const int *JA, + const int *DESCA, + int * ipiv, + int * INFO); + void + psgetrf_(const int *m, + const int *n, + float * A, + const int *IA, + const int *JA, + const int *DESCA, + int * ipiv, + int * INFO); + void + pzgetrf_(const int * m, + const int * n, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * ipiv, + int * INFO); + /** + * Compute the inverse of a real symmetric positive definite + * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the + * Cholesky factorization sub( A ) = U**T*U or L*L**T computed by + * PDPOTRF. + * + * http://www.netlib.org/scalapack/explore-html/d2/d44/pdpotri_8f_source.html + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpotri.htm + * https://software.intel.com/en-us/mkl-developer-reference-c-p-potri + */ + void + pdpotri_(const char *UPLO, + const int * N, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + void + pspotri_(const char *UPLO, + const int * N, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + void + pzpotri_(const char * UPLO, + const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + + /** + * PDGETRI computes the inverse of a distributed matrix using the LU + * factorization computed by PDGETRF. This method inverts U and then + * computes the inverse of sub( A ) = A(IA:IA+N-1,JA:JA+N-1) denoted + * InvA by solving the system InvA*L = inv(U) for InvA. + * + * http://www.netlib.org/scalapack/explore-html/d3/df3/pdgetri_8f_source.html + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgetri.htm + */ + void + pdgetri_(const int *N, + double * A, + const int *IA, + const int *JA, + const int *DESCA, + const int *ipiv, + double * work, + int * lwork, + int * iwork, + int * liwork, + int * info); + void + psgetri_(const int *N, + float * A, + const int *IA, + const int *JA, + const int *DESCA, + const int *ipiv, + float * work, + int * lwork, + int * iwork, + int * liwork, + int * info); + void + pzgetri_(const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const int * ipiv, + std::complex *work, + int * lwork, + int * iwork, + int * liwork, + int * info); + + /** + * PDTRTRI computes the inverse of a upper or lower triangular + * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1). + * + * http://www.netlib.org/scalapack/explore-html/d9/dc0/pdtrtri_8f_source.html + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpdtri.htm + * https://software.intel.com/en-us/mkl-developer-reference-c-p-trtri + */ + void + pdtrtri_(const char *UPLO, + const char *DIAG, + const int * N, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + void + pstrtri_(const char *UPLO, + const char *DIAG, + const int * N, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + + void + pztrtri_(const char * UPLO, + const char * DIAG, + const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO); + + /** + * Estimate the reciprocal of the condition number (in the + * l1-norm) of a real symmetric positive definite distributed matrix + * using the Cholesky factorization. + * + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpocon.htm#lpocon + * http://www.netlib.org/scalapack/explore-html/d4/df7/pdpocon_8f.html + * https://software.intel.com/en-us/mkl-developer-reference-fortran-pocon + */ + void + pdpocon_(const char * uplo, + const int * N, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *ANORM, + double * RCOND, + double * WORK, + const int * LWORK, + int * IWORK, + const int * LIWORK, + int * INFO); + void + pspocon_(const char * uplo, + const int * N, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *ANORM, + float * RCOND, + float * WORK, + const int * LWORK, + int * IWORK, + const int * LIWORK, + int * INFO); + + /** + * Norm of a real symmetric matrix + * + * http://www.netlib.org/scalapack/explore-html/dd/d12/pdlansy_8f_source.html + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_pdlansy.htm#pdlansy + */ + double + pdlansy_(const char * norm, + const char * uplo, + const int * N, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + double * work); + float + pslansy_(const char * norm, + const char * uplo, + const int * N, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + float * work); + + /** + * Compute the Least Common Multiple (LCM) of two positive integers @p M and @p N. + * In fact the routine Compute the greatest common divisor (GCD) and + * use the fact that M*N = GCD*LCM. + * + * http://www.netlib.org/scalapack/explore-html/d0/d9b/ilcm_8f_source.html + */ + int + ilcm_(const int *M, const int *N); + + /** + * Return the ceiling of the division of two integers. + * + * http://www.netlib.org/scalapack/explore-html/df/d07/iceil_8f_source.html + */ + int + iceil_(const int *i1, const int *i2); + + /** + * Initialize the descriptor vector with the 8 input arguments + */ + void + descinit_(int * desc, + const int *m, + const int *n, + const int *mb, + const int *nb, + const int *irsrc, + const int *icsrc, + const int *ictxt, + const int *lld, + int * info); + + /** + * Compute the global index of a distributed matrix entry + * pointed to by the local index @p indxloc of the process indicated by + * @p iproc. + * + * @param indxloc The local index of the distributed matrix entry. + * @param nb Block size, size of the blocks the distributed matrix is split + * into. + * @param iproc The coordinate of the process whose local array row or column + * is to be determined + * @param isrcproc The coordinate of the process that possesses the first + * row/column of the distributed matrix + * @param nprocs The total number processes over which the distributed matrix + * is distributed + */ + int + indxl2g_(const int *indxloc, + const int *nb, + const int *iproc, + const int *isrcproc, + const int *nprocs); + + /** + * Compute the solution to a real system of linear equations + */ + void + pdgesv_(const int *n, + const int *nrhs, + double * A, + const int *ia, + const int *ja, + const int *desca, + int * ipiv, + double * B, + const int *ib, + const int *jb, + const int *descb, + int * info); + void + psgesv_(const int *n, + const int *nrhs, + float * A, + const int *ia, + const int *ja, + const int *desca, + int * ipiv, + float * B, + const int *ib, + const int *jb, + const int *descb, + int * info); + + /** + * Perform one of the matrix-matrix operations: + * @f{align*}{ + * \mathrm{sub}(C) &\dealcoloneq \alpha + * op(\mathrm{sub}(A))op(\mathrm{sub}(B)) + * + \beta \mathrm{sub}(C), \\ + * \mathrm{sub}(C) &\dealcoloneq \alpha + * op(\mathrm{sub}(A))op(\mathrm{sub}(B)) + * + beta sub(C), + * @f} + * where + * $\mathrm{sub}(C)$ denotes C(IC:IC+M-1,JC:JC+N-1), and, $op(X)$ is one of + * $op(X) = X$ or $op(X) = X^T$. + */ + void + pdgemm_(const char * transa, + const char * transb, + const int * m, + const int * n, + const int * k, + const double *alpha, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *B, + const int * IB, + const int * JB, + const int * DESCB, + const double *beta, + double * C, + const int * IC, + const int * JC, + const int * DESCC); + void + psgemm_(const char * transa, + const char * transb, + const int * m, + const int * n, + const int * k, + const float *alpha, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *B, + const int * IB, + const int * JB, + const int * DESCB, + const float *beta, + float * C, + const int * IC, + const int * JC, + const int * DESCC); + + void + pzgemm_(const char * transa, + const char * transb, + const int * m, + const int * n, + const int * k, + const std::complex *alpha, + const std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const std::complex *B, + const int * IB, + const int * JB, + const int * DESCB, + const std::complex *beta, + std::complex * C, + const int * IC, + const int * JC, + const int * DESCC); + + /** + * Return the value of the one norm, or the Frobenius norm, or the infinity + * norm, or the element of largest absolute value of a distributed matrix + */ + double + pdlange_(char const * norm, + const int * m, + const int * n, + const double *A, + const int * ia, + const int * ja, + const int * desca, + double * work); + float + pslange_(const char * norm, + const int * m, + const int * n, + const float *A, + const int * ia, + const int * ja, + const int * desca, + float * work); + + /** + * Compute the process coordinate which possesses the entry of a + * distributed matrix specified by a global index + */ + int + indxg2p_(const int *glob, + const int *nb, + const int *iproc, + const int *isproc, + const int *nprocs); + + /** + * Compute all eigenvalues and, optionally, eigenvectors of a real symmetric + * matrix A by calling the recommended sequence of ScaLAPACK routines. In + * its present form, the routine assumes a homogeneous system and makes no + * checks for consistency of the eigenvalues or eigenvectors across the + * different processes. Because of this, it is possible that a heterogeneous + * system may return incorrect results without any error messages. + * + * http://www.netlib.org/scalapack/explore-html/d0/d1a/pdsyev_8f.html + * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lsyev.htm#lsyev + */ + void + pdsyev_(const char *jobz, + const char *uplo, + const int * m, + double * A, + const int * ia, + const int * ja, + int * desca, + double * w, + double * z, + const int * iz, + const int * jz, + int * descz, + double * work, + const int * lwork, + int * info); + void + pssyev_(const char *jobz, + const char *uplo, + const int * m, + float * A, + const int * ia, + const int * ja, + int * desca, + float * w, + float * z, + const int * iz, + const int * jz, + int * descz, + float * work, + const int * lwork, + int * info); + void + pzheev_(const char * jobz, + const char * uplo, + const int * m, + std::complex *A, + const int * ia, + const int * ja, + int * desca, + double * w, + std::complex *z, + const int * iz, + const int * jz, + int * descz, + std::complex *work, + const int * lwork, + int * info); + + /** + * Copy all or a part of a distributed matrix A to another distributed + * matrix B. No communication is performed, pdlacpy performs a local copy + * $\mathrm{sub}(A) \dealcoloneq \mathrm{sub}(B)$, where $\mathrm{sub}(A)$ + * denotes $A(ia:ia+m-1, ja:ja+n-1)$ and $\mathrm{sub}(B)$ denotes + * $B(ib:ib+m-1, jb:jb+n-1)$. + */ + void + pdlacpy_(const char * uplo, + const int * m, + const int * n, + const double *A, + const int * ia, + const int * ja, + const int * desca, + double * B, + const int * ib, + const int * jb, + const int * descb); + void + pslacpy_(const char * uplo, + const int * m, + const int * n, + const float *A, + const int * ia, + const int * ja, + const int * desca, + float * B, + const int * ib, + const int * jb, + const int * descb); + + /** + * Copies the content of a general rectangular distributed matrix @p A to another distributed matrix @p B + * It is not required that the matrices A and B have the same process grid + * or block size, e.g. copying a matrix from a one-dimensional to a + * two-dimensional process grid + * @p ictxt is a context which is at least a union of all processes in + * context A and B + */ + void + pdgemr2d_(const int * m, + const int * n, + const double *A, + const int * ia, + const int * ja, + const int * desca, + double * B, + const int * ib, + const int * jb, + const int * descb, + const int * ictxt); + void + psgemr2d_(const int * m, + const int * n, + const float *A, + const int * ia, + const int * ja, + const int * desca, + float * B, + const int * ib, + const int * jb, + const int * descb, + const int * ictxt); + + /** + * helper routines determining machine precision + */ + double + pdlamch_(const int *ictxt, const char *cmach); + float + pslamch_(const int *ictxt, const char *cmach); + + + /** + * psyevx computes selected eigenvalues and, optionally, eigenvectors + * of a real symmetric matrix A. Eigenvalues/vectors can be selected by + * specifying a range of values or a range of indices for the desired + * eigenvalues. + */ + void + pdsyevx_(const char * jobz, + const char * range, + const char * uplo, + const int * n, + double * A, + const int * ia, + const int * ja, + const int * desca, + const double *VL, + const double *VU, + const int * il, + const int * iu, + const double *abstol, + const int * m, + const int * nz, + double * w, + double * orfac, + double * Z, + const int * iz, + const int * jz, + const int * descz, + double * work, + int * lwork, + int * iwork, + int * liwork, + int * ifail, + int * iclustr, + double * gap, + int * info); + void + pssyevx_(const char * jobz, + const char * range, + const char * uplo, + const int * n, + float * A, + const int * ia, + const int * ja, + const int * desca, + const float *VL, + const float *VU, + const int * il, + const int * iu, + const float *abstol, + const int * m, + const int * nz, + float * w, + float * orfac, + float * Z, + const int * iz, + const int * jz, + const int * descz, + float * work, + int * lwork, + int * iwork, + int * liwork, + int * ifail, + int * iclustr, + float * gap, + int * info); + void + pzheevx_(const char * jobz, + const char * range, + const char * uplo, + const int * n, + std::complex *A, + const int * ia, + const int * ja, + const int * desca, + const double * VL, + const double * VU, + const int * il, + const int * iu, + const double * abstol, + const int * m, + const int * nz, + double * w, + double * orfac, + std::complex *Z, + const int * iz, + const int * jz, + const int * descz, + std::complex *work, + int * lwork, + int * iwork, + int * liwork, + int * ifail, + int * iclustr, + double * gap, + int * info); + + /* + * PDGESVD computes the singular value decomposition (SVD) of an + * M-by-N matrix A, optionally computing the left and/or right + * singular vectors + */ + void + pdgesvd_(const char *jobu, + const char *jobvt, + const int * m, + const int * n, + double * A, + const int * ia, + const int * ja, + const int * desca, + double * S, + double * U, + const int * iu, + const int * ju, + const int * descu, + double * VT, + const int * ivt, + const int * jvt, + const int * descvt, + double * work, + int * lwork, + int * info); + void + psgesvd_(const char *jobu, + const char *jobvt, + const int * m, + const int * n, + float * A, + const int * ia, + const int * ja, + const int * desca, + float * S, + float * U, + const int * iu, + const int * ju, + const int * descu, + float * VT, + const int * ivt, + const int * jvt, + const int * descvt, + float * work, + int * lwork, + int * info); + + /* + * P_GELS solves overdetermined or underdetermined real linear + * systems involving an M-by-N matrix A, or its transpose, + * using a QR or LQ factorization of A. It is assumed that A has full rank. + */ + void + pdgels_(const char *trans, + const int * m, + const int * n, + const int * nrhs, + double * A, + const int * ia, + const int * ja, + const int * desca, + double * B, + const int * ib, + const int * jb, + const int * descb, + double * work, + int * lwork, + int * info); + void + psgels_(const char *trans, + const int * m, + const int * n, + const int * nrhs, + float * A, + const int * ia, + const int * ja, + const int * desca, + float * B, + const int * ib, + const int * jb, + const int * descb, + float * work, + int * lwork, + int * info); + + /* + * Perform matrix sum: + * @f{equation*} + * C \dealcoloneq \beta C + \alpha op(A), + * @f + * where $op(A)$ denotes either $op(A) = A$ or $op(A)=A^T$. + */ + void + pdgeadd_(const char * transa, + const int * m, + const int * n, + const double *alpha, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *beta, + double * C, + const int * IC, + const int * JC, + const int * DESCC); + void + psgeadd_(const char * transa, + const int * m, + const int * n, + const float *alpha, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *beta, + float * C, + const int * IC, + const int * JC, + const int * DESCC); + void + pzgeadd_(const char * transa, + const int * m, + const int * n, + const std::complex *alpha, + const std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const std::complex *beta, + std::complex * C, + const int * IC, + const int * JC, + const int * DESCC); + + /** + * Routine to transpose a matrix: + * C = beta C + alpha A^T + */ + void + pdtran_(const int * m, + const int * n, + const double *alpha, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *beta, + double * C, + const int * IC, + const int * JC, + const int * DESCC); + void + pstran_(const int * m, + const int * n, + const float *alpha, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *beta, + float * C, + const int * IC, + const int * JC, + const int * DESCC); + + /** + * psyevr computes selected eigenvalues and, optionally, eigenvectors + * of a real symmetric matrix A using a parallel implementation of the MRR + * algorithm. Eigenvalues/vectors can be selected by specifying a range of + * values or a range of indices for the desired eigenvalues. + */ + void + pdsyevr_(const char * jobz, + const char * range, + const char * uplo, + const int * n, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + const double *VL, + const double *VU, + const int * IL, + const int * IU, + int * m, + int * nz, + double * w, + double * Z, + const int * IZ, + const int * JZ, + const int * DESCZ, + double * work, + int * lwork, + int * iwork, + int * liwork, + int * info); + void + pssyevr_(const char * jobz, + const char * range, + const char * uplo, + const int * n, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + const float *VL, + const float *VU, + const int * IL, + const int * IU, + int * m, + int * nz, + float * w, + float * Z, + const int * IZ, + const int * JZ, + const int * DESCZ, + float * work, + int * lwork, + int * iwork, + int * liwork, + int * info); + void + pzheevr_(const char * jobz, + const char * range, + const char * uplo, + const int * n, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const double * VL, + const double * VU, + const int * IL, + const int * IU, + int * m, + int * nz, + double * w, + std::complex *Z, + const int * IZ, + const int * JZ, + const int * DESCZ, + std::complex *work, + int * lwork, + int * iwork, + int * liwork, + int * info); + } + + + + /* + * In the following we have template wrappers for the ScaLAPACK routines + * wrappers for other numeric types can be added in the future + */ + template + inline void + Cgerv2d(int /*context*/, + int /*M*/, + int /*N*/, + number * /*A*/, + int /*lda*/, + int /*rsrc*/, + int /*csrc*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + Cgerv2d(int context, int M, int N, double *A, int lda, int rsrc, int csrc) + { + Cdgerv2d(context, M, N, A, lda, rsrc, csrc); + } + + inline void + Cgerv2d(int context, int M, int N, float *A, int lda, int rsrc, int csrc) + { + Csgerv2d(context, M, N, A, lda, rsrc, csrc); + } + + + template + inline void + Cgesd2d(int /*context*/, + int /*M*/, + int /*N*/, + number * /*A*/, + int /*lda*/, + int /*rdest*/, + int /*cdest*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + Cgesd2d(int context, int M, int N, double *A, int lda, int rdest, int cdest) + { + Cdgesd2d(context, M, N, A, lda, rdest, cdest); + } + + inline void + Cgesd2d(int context, int M, int N, float *A, int lda, int rdest, int cdest) + { + Csgesd2d(context, M, N, A, lda, rdest, cdest); + } + + inline void + pplacgv(const int *N, + double * A, + const int *IA, + const int *JA, + const int *DESCA, + const int *INCX) + {} + + inline void + pplacgv(const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const int * INCX) + { + pzlacgv_(N, A, IA, JA, DESCA, INCX); + } + + template + inline void + ppotrf(const char * /*UPLO*/, + const int * /*N*/, + number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + int * /*INFO*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + ppotrf(const char *UPLO, + const int * N, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pdpotrf_(UPLO, N, A, IA, JA, DESCA, INFO); + } + + inline void + ppotrf(const char *UPLO, + const int * N, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pspotrf_(UPLO, N, A, IA, JA, DESCA, INFO); + } + + inline void + ppotrf(const char * UPLO, + const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pzpotrf_(UPLO, N, A, IA, JA, DESCA, INFO); + } + + template + inline void + pgetrf(const int * /*m*/, + const int * /*n*/, + number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + int * /*ipiv*/, + int * /*INFO*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgetrf(const int *m, + const int *n, + double * A, + const int *IA, + const int *JA, + const int *DESCA, + int * ipiv, + int * INFO) + { + pdgetrf_(m, n, A, IA, JA, DESCA, ipiv, INFO); + } + + inline void + pgetrf(const int *m, + const int *n, + float * A, + const int *IA, + const int *JA, + const int *DESCA, + int * ipiv, + int * INFO) + { + psgetrf_(m, n, A, IA, JA, DESCA, ipiv, INFO); + } + + inline void + pgetrf(const int * m, + const int * n, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * ipiv, + int * INFO) + { + pzgetrf_(m, n, A, IA, JA, DESCA, ipiv, INFO); + } + + template + inline void + ppotri(const char * /*UPLO*/, + const int * /*N*/, + number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + int * /*INFO*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + ppotri(const char *UPLO, + const int * N, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pdpotri_(UPLO, N, A, IA, JA, DESCA, INFO); + } + + inline void + ppotri(const char *UPLO, + const int * N, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pspotri_(UPLO, N, A, IA, JA, DESCA, INFO); + } + + inline void + ppotri(const char * UPLO, + const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pzpotri_(UPLO, N, A, IA, JA, DESCA, INFO); + } + + template + inline void + pgetri(const int * /*N*/, + number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + const int * /*ipiv*/, + number * /*work*/, + int * /*lwork*/, + int * /*iwork*/, + int * /*liwork*/, + int * /*info*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgetri(const int *N, + double * A, + const int *IA, + const int *JA, + const int *DESCA, + const int *ipiv, + double * work, + int * lwork, + int * iwork, + int * liwork, + int * info) + { + pdgetri_(N, A, IA, JA, DESCA, ipiv, work, lwork, iwork, liwork, info); + } + + inline void + pgetri(const int *N, + float * A, + const int *IA, + const int *JA, + const int *DESCA, + const int *ipiv, + float * work, + int * lwork, + int * iwork, + int * liwork, + int * info) + { + psgetri_(N, A, IA, JA, DESCA, ipiv, work, lwork, iwork, liwork, info); + } + + inline void + pgetri(const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const int * ipiv, + std::complex *work, + int * lwork, + int * iwork, + int * liwork, + int * info) + { + pzgetri_(N, A, IA, JA, DESCA, ipiv, work, lwork, iwork, liwork, info); + } + + + + template + inline void + ptrtri(const char * /*UPLO*/, + const char * /*DIAG*/, + const int * /*N*/, + number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + int * /*INFO*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + ptrtri(const char *UPLO, + const char *DIAG, + const int * N, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pdtrtri_(UPLO, DIAG, N, A, IA, JA, DESCA, INFO); + } + + inline void + ptrtri(const char *UPLO, + const char *DIAG, + const int * N, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pstrtri_(UPLO, DIAG, N, A, IA, JA, DESCA, INFO); + } + + inline void + ptrtri(const char * UPLO, + const char * DIAG, + const int * N, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + int * INFO) + { + pztrtri_(UPLO, DIAG, N, A, IA, JA, DESCA, INFO); + } + + template + inline void + ppocon(const char * /*uplo*/, + const int * /*N*/, + const number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + const number * /*ANORM*/, + number * /*RCOND*/, + number * /*WORK*/, + const int * /*LWORK*/, + int * /*IWORK*/, + const int * /*LIWORK*/, + int * /*INFO*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + ppocon(const char * uplo, + const int * N, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *ANORM, + double * RCOND, + double * WORK, + const int * LWORK, + int * IWORK, + const int * LIWORK, + int * INFO) + { + pdpocon_(uplo, + N, + A, + IA, + JA, + DESCA, + ANORM, + RCOND, + WORK, + LWORK, + IWORK, + LIWORK, + INFO); + } + + inline void + ppocon(const char * uplo, + const int * N, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *ANORM, + float * RCOND, + float * WORK, + const int * LWORK, + int * IWORK, + const int * LIWORK, + int * INFO) + { + pspocon_(uplo, + N, + A, + IA, + JA, + DESCA, + ANORM, + RCOND, + WORK, + LWORK, + IWORK, + LIWORK, + INFO); + } + + + template + inline number + plansy(const char * /*norm*/, + const char * /*uplo*/, + const int * /*N*/, + const number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + number * /*work*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline double + plansy(const char * norm, + const char * uplo, + const int * N, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + double * work) + { + return pdlansy_(norm, uplo, N, A, IA, JA, DESCA, work); + } + + inline float + plansy(const char * norm, + const char * uplo, + const int * N, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + float * work) + { + return pslansy_(norm, uplo, N, A, IA, JA, DESCA, work); + } + + + template + inline void + pgesv(const int * /*n*/, + const int * /*nrhs*/, + number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + const int * /*desca*/, + int * /*ipiv*/, + number * /*B*/, + const int * /*ib*/, + const int * /*jb*/, + const int * /*descb*/, + int * /*info*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgesv(const int *n, + const int *nrhs, + double * A, + const int *ia, + const int *ja, + const int *desca, + int * ipiv, + double * B, + const int *ib, + const int *jb, + const int *descb, + int * info) + { + pdgesv_(n, nrhs, A, ia, ja, desca, ipiv, B, ib, jb, descb, info); + } + + inline void + pgesv(const int *n, + const int *nrhs, + float * A, + const int *ia, + const int *ja, + const int *desca, + int * ipiv, + float * B, + const int *ib, + const int *jb, + const int *descb, + int * info) + { + psgesv_(n, nrhs, A, ia, ja, desca, ipiv, B, ib, jb, descb, info); + } + + + template + inline void + pgemm(const char * /*transa*/, + const char * /*transb*/, + const int * /*m*/, + const int * /*n*/, + const int * /*k*/, + const number * /*alpha*/, + const number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + const number * /*B*/, + const int * /*IB*/, + const int * /*JB*/, + const int * /*DESCB*/, + const number * /*beta*/, + number * /*C*/, + const int * /*IC*/, + const int * /*JC*/, + const int * /*DESCC*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgemm(const char * transa, + const char * transb, + const int * m, + const int * n, + const int * k, + const double *alpha, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *B, + const int * IB, + const int * JB, + const int * DESCB, + const double *beta, + double * C, + const int * IC, + const int * JC, + const int * DESCC) + { + pdgemm_(transa, + transb, + m, + n, + k, + alpha, + A, + IA, + JA, + DESCA, + B, + IB, + JB, + DESCB, + beta, + C, + IC, + JC, + DESCC); + } + + inline void + pgemm(const char * transa, + const char * transb, + const int * m, + const int * n, + const int * k, + const float *alpha, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *B, + const int * IB, + const int * JB, + const int * DESCB, + const float *beta, + float * C, + const int * IC, + const int * JC, + const int * DESCC) + { + psgemm_(transa, + transb, + m, + n, + k, + alpha, + A, + IA, + JA, + DESCA, + B, + IB, + JB, + DESCB, + beta, + C, + IC, + JC, + DESCC); + } + + + inline void + pgemm(const char * transa, + const char * transb, + const int * m, + const int * n, + const int * k, + const std::complex *alpha, + const std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const std::complex *B, + const int * IB, + const int * JB, + const int * DESCB, + const std::complex *beta, + std::complex * C, + const int * IC, + const int * JC, + const int * DESCC) + { + pzgemm_(transa, + transb, + m, + n, + k, + alpha, + A, + IA, + JA, + DESCA, + B, + IB, + JB, + DESCB, + beta, + C, + IC, + JC, + DESCC); + } + + template + inline number + plange(const char * /*norm*/, + const int * /*m*/, + const int * /*n*/, + const number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + const int * /*desca*/, + number * /*work*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline double + plange(const char * norm, + const int * m, + const int * n, + const double *A, + const int * ia, + const int * ja, + const int * desca, + double * work) + { + return pdlange_(norm, m, n, A, ia, ja, desca, work); + } + + inline float + plange(const char * norm, + const int * m, + const int * n, + const float *A, + const int * ia, + const int * ja, + const int * desca, + float * work) + { + return pslange_(norm, m, n, A, ia, ja, desca, work); + } + + + template + inline void + psyev(const char * /*jobz*/, + const char * /*uplo*/, + const int * /*m*/, + number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + int * /*desca*/, + number * /*w*/, + number * /*z*/, + const int * /*iz*/, + const int * /*jz*/, + int * /*descz*/, + number * /*work*/, + const int * /*lwork*/, + int * /*info*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + psyev(const char *jobz, + const char *uplo, + const int * m, + double * A, + const int * ia, + const int * ja, + int * desca, + double * w, + double * z, + const int * iz, + const int * jz, + int * descz, + double * work, + const int * lwork, + int * info) + { + pdsyev_( + jobz, uplo, m, A, ia, ja, desca, w, z, iz, jz, descz, work, lwork, info); + } + + inline void + psyev(const char *jobz, + const char *uplo, + const int * m, + float * A, + const int * ia, + const int * ja, + int * desca, + float * w, + float * z, + const int * iz, + const int * jz, + int * descz, + float * work, + const int * lwork, + int * info) + { + pssyev_( + jobz, uplo, m, A, ia, ja, desca, w, z, iz, jz, descz, work, lwork, info); + } + + inline void + psyev(const char * jobz, + const char * uplo, + const int * m, + std::complex *A, + const int * ia, + const int * ja, + int * desca, + double * w, + std::complex *z, + const int * iz, + const int * jz, + int * descz, + std::complex *work, + const int * lwork, + int * info) + { + pzheev_( + jobz, uplo, m, A, ia, ja, desca, w, z, iz, jz, descz, work, lwork, info); + } + + template + inline void + placpy(const char * /*uplo*/, + const int * /*m*/, + const int * /*n*/, + const number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + const int * /*desca*/, + number * /*B*/, + const int * /*ib*/, + const int * /*jb*/, + const int * /*descb*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + placpy(const char * uplo, + const int * m, + const int * n, + const double *A, + const int * ia, + const int * ja, + const int * desca, + double * B, + const int * ib, + const int * jb, + const int * descb) + { + pdlacpy_(uplo, m, n, A, ia, ja, desca, B, ib, jb, descb); + } + + inline void + placpy(const char * uplo, + const int * m, + const int * n, + const float *A, + const int * ia, + const int * ja, + const int * desca, + float * B, + const int * ib, + const int * jb, + const int * descb) + { + pslacpy_(uplo, m, n, A, ia, ja, desca, B, ib, jb, descb); + } + + + template + inline void + pgemr2d(const int * /*m*/, + const int * /*n*/, + const number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + const int * /*desca*/, + number * /*B*/, + const int * /*ib*/, + const int * /*jb*/, + const int * /*descb*/, + const int * /*ictxt*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgemr2d(const int * m, + const int * n, + const double *A, + const int * ia, + const int * ja, + const int * desca, + double * B, + const int * ib, + const int * jb, + const int * descb, + const int * ictxt) + { + pdgemr2d_(m, n, A, ia, ja, desca, B, ib, jb, descb, ictxt); + } + + inline void + pgemr2d(const int * m, + const int * n, + const float *A, + const int * ia, + const int * ja, + const int * desca, + float * B, + const int * ib, + const int * jb, + const int * descb, + const int * ictxt) + { + psgemr2d_(m, n, A, ia, ja, desca, B, ib, jb, descb, ictxt); + } + + + template + inline void + plamch(const int * /*ictxt*/, const char * /*cmach*/, number & /*val*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + plamch(const int *ictxt, const char *cmach, double &val) + { + val = pdlamch_(ictxt, cmach); + } + + inline void + plamch(const int *ictxt, const char *cmach, float &val) + { + val = pslamch_(ictxt, cmach); + } + + + template + inline void + psyevx(const char * /*jobz*/, + const char * /*range*/, + const char * /*uplo*/, + const int * /*n*/, + number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + const int * /*desca*/, + number * /*VL*/, + number * /*VU*/, + const int * /*il*/, + const int * /*iu*/, + number * /*abstol*/, + const int * /*m*/, + const int * /*nz*/, + number * /*w*/, + number * /*orfac*/, + number * /*Z*/, + const int * /*iz*/, + const int * /*jz*/, + const int * /*descz*/, + number * /*work*/, + int * /*lwork*/, + int * /*iwork*/, + int * /*liwork*/, + int * /*ifail*/, + int * /*iclustr*/, + number * /*gap*/, + int * /*info*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + psyevx(const char *jobz, + const char *range, + const char *uplo, + const int * n, + double * A, + const int * ia, + const int * ja, + const int * desca, + double * VL, + double * VU, + const int * il, + const int * iu, + double * abstol, + const int * m, + const int * nz, + double * w, + double * orfac, + double * Z, + const int * iz, + const int * jz, + const int * descz, + double * work, + int * lwork, + int * iwork, + int * liwork, + int * ifail, + int * iclustr, + double * gap, + int * info) + { + pdsyevx_(jobz, + range, + uplo, + n, + A, + ia, + ja, + desca, + VL, + VU, + il, + iu, + abstol, + m, + nz, + w, + orfac, + Z, + iz, + jz, + descz, + work, + lwork, + iwork, + liwork, + ifail, + iclustr, + gap, + info); + } + + inline void + psyevx(const char *jobz, + const char *range, + const char *uplo, + const int * n, + float * A, + const int * ia, + const int * ja, + const int * desca, + float * VL, + float * VU, + const int * il, + const int * iu, + float * abstol, + const int * m, + const int * nz, + float * w, + float * orfac, + float * Z, + const int * iz, + const int * jz, + const int * descz, + float * work, + int * lwork, + int * iwork, + int * liwork, + int * ifail, + int * iclustr, + float * gap, + int * info) + { + pssyevx_(jobz, + range, + uplo, + n, + A, + ia, + ja, + desca, + VL, + VU, + il, + iu, + abstol, + m, + nz, + w, + orfac, + Z, + iz, + jz, + descz, + work, + lwork, + iwork, + liwork, + ifail, + iclustr, + gap, + info); + } + + inline void + psyevx(const char * jobz, + const char * range, + const char * uplo, + const int * n, + std::complex *A, + const int * ia, + const int * ja, + const int * desca, + double * VL, + double * VU, + const int * il, + const int * iu, + double * abstol, + const int * m, + const int * nz, + double * w, + double * orfac, + std::complex *Z, + const int * iz, + const int * jz, + const int * descz, + std::complex *work, + int * lwork, + int * iwork, + int * liwork, + int * ifail, + int * iclustr, + double * gap, + int * info) + { + pzheevx_(jobz, + range, + uplo, + n, + A, + ia, + ja, + desca, + VL, + VU, + il, + iu, + abstol, + m, + nz, + w, + orfac, + Z, + iz, + jz, + descz, + work, + lwork, + iwork, + liwork, + ifail, + iclustr, + gap, + info); + } + + template + inline void + pgesvd(const char * /*jobu*/, + const char * /*jobvt*/, + const int * /*m*/, + const int * /*n*/, + number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + const int * /*desca*/, + number * /*S*/, + number * /*U*/, + const int * /*iu*/, + const int * /*ju*/, + const int * /*descu*/, + number * /*VT*/, + const int * /*ivt*/, + const int * /*jvt*/, + const int * /*descvt*/, + number * /*work*/, + int * /*lwork*/, + int * /*info*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgesvd(const char *jobu, + const char *jobvt, + const int * m, + const int * n, + double * A, + const int * ia, + const int * ja, + const int * desca, + double * S, + double * U, + const int * iu, + const int * ju, + const int * descu, + double * VT, + const int * ivt, + const int * jvt, + const int * descvt, + double * work, + int * lwork, + int * info) + { + pdgesvd_(jobu, + jobvt, + m, + n, + A, + ia, + ja, + desca, + S, + U, + iu, + ju, + descu, + VT, + ivt, + jvt, + descvt, + work, + lwork, + info); + } + + inline void + pgesvd(const char *jobu, + const char *jobvt, + const int * m, + const int * n, + float * A, + const int * ia, + const int * ja, + const int * desca, + float * S, + float * U, + const int * iu, + const int * ju, + const int * descu, + float * VT, + const int * ivt, + const int * jvt, + const int * descvt, + float * work, + int * lwork, + int * info) + { + psgesvd_(jobu, + jobvt, + m, + n, + A, + ia, + ja, + desca, + S, + U, + iu, + ju, + descu, + VT, + ivt, + jvt, + descvt, + work, + lwork, + info); + } + + + template + inline void + pgels(const char * /*trans*/, + const int * /*m*/, + const int * /*n*/, + const int * /*nrhs*/, + number * /*A*/, + const int * /*ia*/, + const int * /*ja*/, + const int * /*desca*/, + number * /*B*/, + const int * /*ib*/, + const int * /*jb*/, + const int * /*descb*/, + number * /*work*/, + int * /*lwork*/, + int * /*info*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgels(const char *trans, + const int * m, + const int * n, + const int * nrhs, + double * A, + const int * ia, + const int * ja, + const int * desca, + double * B, + const int * ib, + const int * jb, + const int * descb, + double * work, + int * lwork, + int * info) + { + pdgels_( + trans, m, n, nrhs, A, ia, ja, desca, B, ib, jb, descb, work, lwork, info); + } + + inline void + pgels(const char *trans, + const int * m, + const int * n, + const int * nrhs, + float * A, + const int * ia, + const int * ja, + const int * desca, + float * B, + const int * ib, + const int * jb, + const int * descb, + float * work, + int * lwork, + int * info) + { + psgels_( + trans, m, n, nrhs, A, ia, ja, desca, B, ib, jb, descb, work, lwork, info); + } + + + template + inline void + pgeadd(const char * /*transa*/, + const int * /*m*/, + const int * /*n*/, + const number * /*alpha*/, + const number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + const number * /*beta*/, + number * /*C*/, + const int * /*IC*/, + const int * /*JC*/, + const int * /*DESCC*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + pgeadd(const char * transa, + const int * m, + const int * n, + const double *alpha, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *beta, + double * C, + const int * IC, + const int * JC, + const int * DESCC) + { + pdgeadd_(transa, m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC); + } + + inline void + pgeadd(const char * transa, + const int * m, + const int * n, + const float *alpha, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *beta, + float * C, + const int * IC, + const int * JC, + const int * DESCC) + { + psgeadd_(transa, m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC); + } + + inline void + pgeadd(const char * transa, + const int * m, + const int * n, + const std::complex *alpha, + const std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const std::complex *beta, + std::complex * C, + const int * IC, + const int * JC, + const int * DESCC) + { + pzgeadd_(transa, m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC); + } + + template + inline void + ptran(const int * /*m*/, + const int * /*n*/, + const number * /*alpha*/, + const number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + const number * /*beta*/, + number * /*C*/, + const int * /*IC*/, + const int * /*JC*/, + const int * /*DESCC*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + ptran(const int * m, + const int * n, + const double *alpha, + const double *A, + const int * IA, + const int * JA, + const int * DESCA, + const double *beta, + double * C, + const int * IC, + const int * JC, + const int * DESCC) + { + pdtran_(m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC); + } + + inline void + ptran(const int * m, + const int * n, + const float *alpha, + const float *A, + const int * IA, + const int * JA, + const int * DESCA, + const float *beta, + float * C, + const int * IC, + const int * JC, + const int * DESCC) + { + pstran_(m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC); + } + + + template + inline void + psyevr(const char * /*jobz*/, + const char * /*range*/, + const char * /*uplo*/, + const int * /*n*/, + number * /*A*/, + const int * /*IA*/, + const int * /*JA*/, + const int * /*DESCA*/, + const number * /*VL*/, + const number * /*VU*/, + const int * /*IL*/, + const int * /*IU*/, + int * /*m*/, + int * /*nz*/, + number * /*w*/, + number * /*Z*/, + const int * /*IZ*/, + const int * /*JZ*/, + const int * /*DESCZ*/, + number * /*work*/, + int * /*lwork*/, + int * /*iwork*/, + int * /*liwork*/, + int * /*info*/) + { + AssertThrow(false, dealii::ExcNotImplemented()); + } + + inline void + psyevr(const char * jobz, + const char * range, + const char * uplo, + const int * n, + double * A, + const int * IA, + const int * JA, + const int * DESCA, + const double *VL, + const double *VU, + const int * IL, + const int * IU, + int * m, + int * nz, + double * w, + double * Z, + const int * IZ, + const int * JZ, + const int * DESCZ, + double * work, + int * lwork, + int * iwork, + int * liwork, + int * info) + { + /* + * Netlib ScaLAPACK performs floating point tests (e.g. divide-by-zero) + * within the call to pdsyevr causing floating point exceptions to be thrown + * (at least in debug mode). Therefore, we wrap the calls to pdsyevr into + * the following code to suppress the exception. + */ +#ifdef DEAL_II_HAVE_FP_EXCEPTIONS + fenv_t fp_exceptions; + feholdexcept(&fp_exceptions); +#endif + + pdsyevr_(jobz, + range, + uplo, + n, + A, + IA, + JA, + DESCA, + VL, + VU, + IL, + IU, + m, + nz, + w, + Z, + IZ, + JZ, + DESCZ, + work, + lwork, + iwork, + liwork, + info); + +#ifdef DEAL_II_HAVE_FP_EXCEPTIONS + fesetenv(&fp_exceptions); +#endif + } + + inline void + psyevr(const char * jobz, + const char * range, + const char * uplo, + const int * n, + float * A, + const int * IA, + const int * JA, + const int * DESCA, + const float *VL, + const float *VU, + const int * IL, + const int * IU, + int * m, + int * nz, + float * w, + float * Z, + const int * IZ, + const int * JZ, + const int * DESCZ, + float * work, + int * lwork, + int * iwork, + int * liwork, + int * info) + { + /* + * Netlib ScaLAPACK performs floating point tests (e.g. divide-by-zero) + * within the call to pssyevr causing floating point exceptions to be thrown + * (at least in debug mode). Therefore, we wrap the calls to pssyevr into + * the following code to suppress the exception. + */ +#ifdef DEAL_II_HAVE_FP_EXCEPTIONS + fenv_t fp_exceptions; + feholdexcept(&fp_exceptions); +#endif + + pssyevr_(jobz, + range, + uplo, + n, + A, + IA, + JA, + DESCA, + VL, + VU, + IL, + IU, + m, + nz, + w, + Z, + IZ, + JZ, + DESCZ, + work, + lwork, + iwork, + liwork, + info); + +#ifdef DEAL_II_HAVE_FP_EXCEPTIONS + fesetenv(&fp_exceptions); +#endif + } + + inline void + psyevr(const char * jobz, + const char * range, + const char * uplo, + const int * n, + std::complex *A, + const int * IA, + const int * JA, + const int * DESCA, + const double * VL, + const double * VU, + const int * IL, + const int * IU, + int * m, + int * nz, + double * w, + std::complex *Z, + const int * IZ, + const int * JZ, + const int * DESCZ, + std::complex *work, + int * lwork, + int * iwork, + int * liwork, + int * info) + { + /* + * Netlib ScaLAPACK performs floating point tests (e.g. divide-by-zero) + * within the call to pdsyevr causing floating point exceptions to be thrown + * (at least in debug mode). Therefore, we wrap the calls to pdsyevr into + * the following code to suppress the exception. + */ +#ifdef DEAL_II_HAVE_FP_EXCEPTIONS + fenv_t fp_exceptions; + feholdexcept(&fp_exceptions); +#endif + + pzheevr_(jobz, + range, + uplo, + n, + A, + IA, + JA, + DESCA, + VL, + VU, + IL, + IU, + m, + nz, + w, + Z, + IZ, + JZ, + DESCZ, + work, + lwork, + iwork, + liwork, + info); + +#ifdef DEAL_II_HAVE_FP_EXCEPTIONS + fesetenv(&fp_exceptions); +#endif + } + + inline int + lworkFromWork(std::vector &work) + { + return static_cast(work[0]); + } + + inline int + lworkFromWork(std::vector &work) + { + return static_cast(work[0]); + } + + inline int + lworkFromWork(std::vector> &work) + { + return static_cast(work[0].real()); + } +} // namespace dftefeScalapack +#endif // dftfe_scalapack_templates_h diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackHeaders.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackHeaders.h new file mode 100644 index 000000000..a261533df --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackHeaders.h @@ -0,0 +1,119 @@ +//// --------------------------------------------------------------------- +//// +//// Copyright (c) 2017-2022 The Regents of the University of Michigan and +/// DFT-FE / authors. +//// +//// This file is part of the DFT-FE code. +//// +//// The DFT-FE code is free software; you can use it, redistribute +//// it, and/or modify it under the terms of the GNU Lesser General +//// Public License as published by the Free Software Foundation; either +//// version 2.1 of the License, or (at your option) any later version. +//// The full text of the license can be found in the file LICENSE at +//// the top level of the DFT-FE distribution. +//// +//// --------------------------------------------------------------------- +//// +//// @author Shiva Rudraraju (2016), Phani Motamarri (2016), Sambit Das (2018) +//// +// +//#ifndef headers_H_ +//#define headers_H_ +// +//#ifndef DOXYGEN_SHOULD_SKIP_THIS +//// Include all deal.II header file +//# include +//# include +//# include +//# include +//# include +//# include +//# include +//# include +//# include +//# include +// +//# include +//# include +//# include +// +//# include +//# include +//# include +//# include +// +//# include +//# include +//# include +//# include +// +//# include +//# include +//# include +//# include +//# include +//# include +//# include +//# include +// +//# include +//# include +//# include +//# include +//# include +//# include +//# include +//# include +//# include +// +//# include +//# include +// +//# include +//# include +//# include +//# include +//# ifdef USE_PETSC +//# include +//# endif +//# include +// +//# include +//# include +// +//# if defined(DFTFE_WITH_GPU) +//# include +//# include +//# include +//# endif +// +//# include "dftfeDataTypes.h" +//# include "distributedMulticomponentVec.h" +// +//// Include generic C++ headers +//# include +//# include +//# include +// +//#endif /* DOXYGEN_SHOULD_SKIP_THIS */ +//// commonly used typedefs used in dftfe go here +// namespace dftfe +//{ +// template +// using distributedCPUVec = +// dealii::LinearAlgebra::distributed::Vector; +//#ifdef DFTFE_WITH_GPU +// // template +// // using distributedGPUVec = +// // dealii::LinearAlgebra::distributed::Vector; +// +// template +// using distributedGPUVec = +// dftfe::DistributedMulticomponentVec; +// +//#endif +//} // namespace dftfe +// +//#endif diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.cpp b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.cpp new file mode 100644 index 000000000..4a349ecd2 --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.cpp @@ -0,0 +1,1651 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2019-2020 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// +// @author Sambit Das +// + +#include +#include +#include "scalapackWrapper.h" +#include "scalapack.templates.h" + +namespace dftefeScalapack +{ + template + ScaLAPACKMatrix::ScaLAPACKMatrix( + const size_type n_rows_, + const size_type n_columns_, + const std::shared_ptr &process_grid, + const size_type row_block_size_, + const size_type column_block_size_, + const dftefeScalapack::LAPACKSupport::Property property_) + : uplo('L') + , // for non-hermitian matrices this is not needed + first_process_row(0) + , first_process_column(0) + , submatrix_row(1) + , submatrix_column(1) + { + reinit(n_rows_, + n_columns_, + process_grid, + row_block_size_, + column_block_size_, + property_); + } + + + + template + ScaLAPACKMatrix::ScaLAPACKMatrix( + const size_type size, + const std::shared_ptr &process_grid, + const size_type block_size, + const dftefeScalapack::LAPACKSupport::Property property) + : ScaLAPACKMatrix(size, + size, + process_grid, + block_size, + block_size, + property) + {} + + + + template + void + ScaLAPACKMatrix::reinit( + const size_type n_rows_, + const size_type n_columns_, + const std::shared_ptr &process_grid, + const size_type row_block_size_, + const size_type column_block_size_, + const dftefeScalapack::LAPACKSupport::Property property_) + { + Assert(row_block_size_ > 0, + dealii::ExcMessage("Row block size has to be positive.")); + Assert(column_block_size_ > 0, + dealii::ExcMessage("Column block size has to be positive.")); + Assert( + row_block_size_ <= n_rows_, + dealii::ExcMessage( + "Row block size can not be greater than the number of rows of the matrix")); + Assert( + column_block_size_ <= n_columns_, + dealii::ExcMessage( + "Column block size can not be greater than the number of columns of the matrix")); + + state = dftefeScalapack::LAPACKSupport::State::matrix; + property = property_; + grid = process_grid; + n_rows = n_rows_; + n_columns = n_columns_; + row_block_size = row_block_size_; + column_block_size = column_block_size_; + + if (grid->mpi_process_is_active) + { + // Get local sizes: + n_local_rows = numroc_(&n_rows, + &row_block_size, + &(grid->this_process_row), + &first_process_row, + &(grid->n_process_rows)); + n_local_columns = numroc_(&n_columns, + &column_block_size, + &(grid->this_process_column), + &first_process_column, + &(grid->n_process_columns)); + + // LLD_A = MAX(1,NUMROC(M_A, MB_A, MYROW, RSRC_A, NPROW)), different + // between processes + int lda = std::max(1, n_local_rows); + + int info = 0; + descinit_(descriptor, + &n_rows, + &n_columns, + &row_block_size, + &column_block_size, + &first_process_row, + &first_process_column, + &(grid->blacs_context), + &lda, + &info); + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("descinit_", + info)); + + values.clear(); + values.resize(n_local_rows * n_local_columns, NumberType(0.0)); + // this->TransposeTable::reinit(n_local_rows, + // n_local_columns); + } + else + { + // set process-local variables to something telling: + n_local_rows = -1; + n_local_columns = -1; + std::fill(std::begin(descriptor), std::end(descriptor), -1); + } + } + + + + template + void + ScaLAPACKMatrix::reinit( + const size_type size, + const std::shared_ptr &process_grid, + const size_type block_size, + const dftefeScalapack::LAPACKSupport::Property property) + { + reinit(size, size, process_grid, block_size, block_size, property); + } + + template + void + ScaLAPACKMatrix::set_property( + const dftefeScalapack::LAPACKSupport::Property property_) + { + property = property_; + } + + + + template + dftefeScalapack::LAPACKSupport::Property + ScaLAPACKMatrix::get_property() const + { + return property; + } + + + template + dftefeScalapack::LAPACKSupport::State + ScaLAPACKMatrix::get_state() const + { + return state; + } + + + template + unsigned int + ScaLAPACKMatrix::global_row(const unsigned int loc_row) const + { + Assert(n_local_rows >= 0 && + loc_row < static_cast(n_local_rows), + dealii::ExcIndexRange(loc_row, 0, n_local_rows)); + const int i = loc_row + 1; + return indxl2g_(&i, + &row_block_size, + &(grid->this_process_row), + &first_process_row, + &(grid->n_process_rows)) - + 1; + } + + + + template + unsigned int + ScaLAPACKMatrix::global_column( + const unsigned int loc_column) const + { + Assert(n_local_columns >= 0 && + loc_column < static_cast(n_local_columns), + dealii::ExcIndexRange(loc_column, 0, n_local_columns)); + const int j = loc_column + 1; + return indxl2g_(&j, + &column_block_size, + &(grid->this_process_column), + &first_process_column, + &(grid->n_process_columns)) - + 1; + } + + template + void + ScaLAPACKMatrix::conjugate() + { + if (std::is_same>::value) + { + if (this->grid->mpi_process_is_active) + { + NumberType *A_loc = + (this->values.size() > 0) ? this->values.data() : nullptr; + const int totalSize = n_rows * n_columns; + const int incx = 1; + pplacgv(&totalSize, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &incx); + } + } + } + + + template + void + ScaLAPACKMatrix::copy_to(ScaLAPACKMatrix &dest) const + { + Assert(n_rows == dest.n_rows, + dealii::ExcDimensionMismatch(n_rows, dest.n_rows)); + Assert(n_columns == dest.n_columns, + dealii::ExcDimensionMismatch(n_columns, dest.n_columns)); + + if (this->grid->mpi_process_is_active) + AssertThrow( + this->descriptor[0] == 1, + dealii::ExcMessage( + "Copying of ScaLAPACK matrices only implemented for dense matrices")); + if (dest.grid->mpi_process_is_active) + AssertThrow( + dest.descriptor[0] == 1, + dealii::ExcMessage( + "Copying of ScaLAPACK matrices only implemented for dense matrices")); + + /* + * just in case of different process grids or block-cyclic distributions + * inter-process communication is necessary + * if distributed matrices have the same process grid and block sizes, local + * copying is enough + */ + if ((this->grid != dest.grid) || (row_block_size != dest.row_block_size) || + (column_block_size != dest.column_block_size)) + { + /* + * get the MPI communicator, which is the union of the source and + * destination MPI communicator + */ + int ierr = 0; + MPI_Group group_source, group_dest, group_union; + ierr = MPI_Comm_group(this->grid->mpi_communicator, &group_source); + AssertThrowMPI(ierr); + ierr = MPI_Comm_group(dest.grid->mpi_communicator, &group_dest); + AssertThrowMPI(ierr); + ierr = MPI_Group_union(group_source, group_dest, &group_union); + AssertThrowMPI(ierr); + MPI_Comm mpi_communicator_union; + + // to create a communicator representing the union of the source + // and destination MPI communicator we need a communicator that + // is guaranteed to contain all desired processes -- i.e., + // MPI_COMM_WORLD. on the other hand, as documented in the MPI + // standard, MPI_Comm_create_group is not collective on all + // processes in the first argument, but instead is collective on + // only those processes listed in the group. in other words, + // there is really no harm in passing MPI_COMM_WORLD as the + // first argument, even if the program we are currently running + // and that is calling this function only works on a subset of + // processes. the same holds for the wrapper/fallback we are using here. + + const int mpi_tag = + dealii::Utilities::MPI::internal::Tags::scalapack_copy_to2; + ierr = dealii::Utilities::MPI::create_group(MPI_COMM_WORLD, + group_union, + mpi_tag, + &mpi_communicator_union); + AssertThrowMPI(ierr); + + /* + * The routine pgemr2d requires a BLACS context resembling at least the + * union of process grids described by the BLACS contexts of matrix A + * and + * B + */ + int union_blacs_context = Csys2blacs_handle(mpi_communicator_union); + const char *order = "Col"; + int union_n_process_rows = + dealii::Utilities::MPI::n_mpi_processes(mpi_communicator_union); + int union_n_process_columns = 1; + Cblacs_gridinit(&union_blacs_context, + order, + union_n_process_rows, + union_n_process_columns); + + const NumberType *loc_vals_source = nullptr; + NumberType * loc_vals_dest = nullptr; + + if (this->grid->mpi_process_is_active && (this->values.size() > 0)) + { + AssertThrow(this->values.size() > 0, + dealii::ExcMessage( + "source: process is active but local matrix empty")); + loc_vals_source = this->values.data(); + } + if (dest.grid->mpi_process_is_active && (dest.values.size() > 0)) + { + AssertThrow( + dest.values.size() > 0, + dealii::ExcMessage( + "destination: process is active but local matrix empty")); + loc_vals_dest = dest.values.data(); + } + pgemr2d(&n_rows, + &n_columns, + loc_vals_source, + &submatrix_row, + &submatrix_column, + descriptor, + loc_vals_dest, + &dest.submatrix_row, + &dest.submatrix_column, + dest.descriptor, + &union_blacs_context); + + Cblacs_gridexit(union_blacs_context); + + if (mpi_communicator_union != MPI_COMM_NULL) + { + ierr = MPI_Comm_free(&mpi_communicator_union); + AssertThrowMPI(ierr); + } + ierr = MPI_Group_free(&group_source); + AssertThrowMPI(ierr); + ierr = MPI_Group_free(&group_dest); + AssertThrowMPI(ierr); + ierr = MPI_Group_free(&group_union); + AssertThrowMPI(ierr); + } + else + // process is active in the process grid + if (this->grid->mpi_process_is_active) + dest.values = this->values; + + dest.state = state; + dest.property = property; + } + + template + void + ScaLAPACKMatrix::add(const ScaLAPACKMatrix &B, + const NumberType alpha, + const NumberType beta, + const bool transpose_B) + { + if (transpose_B) + { + Assert(n_rows == B.n_columns, + dealii::ExcDimensionMismatch(n_rows, B.n_columns)); + Assert(n_columns == B.n_rows, + dealii::ExcDimensionMismatch(n_columns, B.n_rows)); + Assert(column_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(column_block_size, + B.row_block_size)); + Assert(row_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(row_block_size, + B.column_block_size)); + } + else + { + Assert(n_rows == B.n_rows, + dealii::ExcDimensionMismatch(n_rows, B.n_rows)); + Assert(n_columns == B.n_columns, + dealii::ExcDimensionMismatch(n_columns, B.n_columns)); + Assert(column_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(column_block_size, + B.column_block_size)); + Assert(row_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(row_block_size, B.row_block_size)); + } + Assert(this->grid == B.grid, + dealii::ExcMessage( + "The matrices A and B need to have the same process grid")); + + if (this->grid->mpi_process_is_active) + { + char trans_b = transpose_B ? 'T' : 'N'; + NumberType *A_loc = + (this->values.size() > 0) ? this->values.data() : nullptr; + const NumberType *B_loc = + (B.values.size() > 0) ? B.values.data() : nullptr; + + pgeadd(&trans_b, + &n_rows, + &n_columns, + &beta, + B_loc, + &B.submatrix_row, + &B.submatrix_column, + B.descriptor, + &alpha, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor); + } + state = dftefeScalapack::LAPACKSupport::matrix; + } + + template + void + ScaLAPACKMatrix::zadd(const ScaLAPACKMatrix &B, + const NumberType alpha, + const NumberType beta, + const bool conjugate_transpose_B) + { + if (conjugate_transpose_B) + { + Assert(n_rows == B.n_columns, + dealii::ExcDimensionMismatch(n_rows, B.n_columns)); + Assert(n_columns == B.n_rows, + dealii::ExcDimensionMismatch(n_columns, B.n_rows)); + Assert(column_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(column_block_size, + B.row_block_size)); + Assert(row_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(row_block_size, + B.column_block_size)); + } + else + { + Assert(n_rows == B.n_rows, + dealii::ExcDimensionMismatch(n_rows, B.n_rows)); + Assert(n_columns == B.n_columns, + dealii::ExcDimensionMismatch(n_columns, B.n_columns)); + Assert(column_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(column_block_size, + B.column_block_size)); + Assert(row_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(row_block_size, B.row_block_size)); + } + Assert(this->grid == B.grid, + dealii::ExcMessage( + "The matrices A and B need to have the same process grid")); + + if (this->grid->mpi_process_is_active) + { + char trans_b = + conjugate_transpose_B ? + (std::is_same>::value ? 'C' : + 'T') : + 'N'; + NumberType *A_loc = + (this->values.size() > 0) ? this->values.data() : nullptr; + const NumberType *B_loc = + (B.values.size() > 0) ? B.values.data() : nullptr; + + pgeadd(&trans_b, + &n_rows, + &n_columns, + &beta, + B_loc, + &B.submatrix_row, + &B.submatrix_column, + B.descriptor, + &alpha, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor); + } + state = dftefeScalapack::LAPACKSupport::matrix; + } + + template + void + ScaLAPACKMatrix::copy_transposed( + const ScaLAPACKMatrix &B) + { + add(B, 0, 1, true); + } + + template + void + ScaLAPACKMatrix::copy_conjugate_transposed( + const ScaLAPACKMatrix &B) + { + zadd(B, 0, 1, true); + } + + template + void + ScaLAPACKMatrix::mult(const NumberType b, + const ScaLAPACKMatrix &B, + const NumberType c, + ScaLAPACKMatrix & C, + const bool transpose_A, + const bool transpose_B) const + { + Assert(this->grid == B.grid, + dealii::ExcMessage( + "The matrices A and B need to have the same process grid")); + Assert(C.grid == B.grid, + dealii::ExcMessage( + "The matrices B and C need to have the same process grid")); + + // see for further info: + // https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgemm.htm + if (!transpose_A && !transpose_B) + { + Assert(this->n_columns == B.n_rows, + dealii::ExcDimensionMismatch(this->n_columns, B.n_rows)); + Assert(this->n_rows == C.n_rows, + dealii::ExcDimensionMismatch(this->n_rows, C.n_rows)); + Assert(B.n_columns == C.n_columns, + dealii::ExcDimensionMismatch(B.n_columns, C.n_columns)); + Assert(this->row_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + C.row_block_size)); + Assert(this->column_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + B.row_block_size)); + Assert(B.column_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.column_block_size, + C.column_block_size)); + } + else if (transpose_A && !transpose_B) + { + Assert(this->n_rows == B.n_rows, + dealii::ExcDimensionMismatch(this->n_rows, B.n_rows)); + Assert(this->n_columns == C.n_rows, + dealii::ExcDimensionMismatch(this->n_columns, C.n_rows)); + Assert(B.n_columns == C.n_columns, + dealii::ExcDimensionMismatch(B.n_columns, C.n_columns)); + Assert(this->column_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + C.row_block_size)); + Assert(this->row_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + B.row_block_size)); + Assert(B.column_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.column_block_size, + C.column_block_size)); + } + else if (!transpose_A && transpose_B) + { + Assert(this->n_columns == B.n_columns, + dealii::ExcDimensionMismatch(this->n_columns, B.n_columns)); + Assert(this->n_rows == C.n_rows, + dealii::ExcDimensionMismatch(this->n_rows, C.n_rows)); + Assert(B.n_rows == C.n_columns, + dealii::ExcDimensionMismatch(B.n_rows, C.n_columns)); + Assert(this->row_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + C.row_block_size)); + Assert(this->column_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + B.column_block_size)); + Assert(B.row_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.row_block_size, + C.column_block_size)); + } + else // if (transpose_A && transpose_B) + { + Assert(this->n_rows == B.n_columns, + dealii::ExcDimensionMismatch(this->n_rows, B.n_columns)); + Assert(this->n_columns == C.n_rows, + dealii::ExcDimensionMismatch(this->n_columns, C.n_rows)); + Assert(B.n_rows == C.n_columns, + dealii::ExcDimensionMismatch(B.n_rows, C.n_columns)); + Assert(this->column_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + C.row_block_size)); + Assert(this->row_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + B.row_block_size)); + Assert(B.row_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.column_block_size, + C.column_block_size)); + } + + if (this->grid->mpi_process_is_active) + { + char trans_a = transpose_A ? 'T' : 'N'; + char trans_b = transpose_B ? 'T' : 'N'; + + const NumberType *A_loc = + (this->values.size() > 0) ? this->values.data() : nullptr; + const NumberType *B_loc = + (B.values.size() > 0) ? B.values.data() : nullptr; + NumberType *C_loc = (C.values.size() > 0) ? C.values.data() : nullptr; + int m = C.n_rows; + int n = C.n_columns; + int k = transpose_A ? this->n_rows : this->n_columns; + + pgemm(&trans_a, + &trans_b, + &m, + &n, + &k, + &b, + A_loc, + &(this->submatrix_row), + &(this->submatrix_column), + this->descriptor, + B_loc, + &B.submatrix_row, + &B.submatrix_column, + B.descriptor, + &c, + C_loc, + &C.submatrix_row, + &C.submatrix_column, + C.descriptor); + } + C.state = dftefeScalapack::LAPACKSupport::matrix; + } + + template + void + ScaLAPACKMatrix::zmult(const NumberType b, + const ScaLAPACKMatrix &B, + const NumberType c, + ScaLAPACKMatrix & C, + const bool conjugate_transpose_A, + const bool conjugate_transpose_B) const + { + Assert(this->grid == B.grid, + dealii::ExcMessage( + "The matrices A and B need to have the same process grid")); + Assert(C.grid == B.grid, + dealii::ExcMessage( + "The matrices B and C need to have the same process grid")); + + // see for further info: + // https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgemm.htm + if (!conjugate_transpose_A && !conjugate_transpose_B) + { + Assert(this->n_columns == B.n_rows, + dealii::ExcDimensionMismatch(this->n_columns, B.n_rows)); + Assert(this->n_rows == C.n_rows, + dealii::ExcDimensionMismatch(this->n_rows, C.n_rows)); + Assert(B.n_columns == C.n_columns, + dealii::ExcDimensionMismatch(B.n_columns, C.n_columns)); + Assert(this->row_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + C.row_block_size)); + Assert(this->column_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + B.row_block_size)); + Assert(B.column_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.column_block_size, + C.column_block_size)); + } + else if (conjugate_transpose_A && !conjugate_transpose_B) + { + Assert(this->n_rows == B.n_rows, + dealii::ExcDimensionMismatch(this->n_rows, B.n_rows)); + Assert(this->n_columns == C.n_rows, + dealii::ExcDimensionMismatch(this->n_columns, C.n_rows)); + Assert(B.n_columns == C.n_columns, + dealii::ExcDimensionMismatch(B.n_columns, C.n_columns)); + Assert(this->column_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + C.row_block_size)); + Assert(this->row_block_size == B.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + B.row_block_size)); + Assert(B.column_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.column_block_size, + C.column_block_size)); + } + else if (!conjugate_transpose_A && conjugate_transpose_B) + { + Assert(this->n_columns == B.n_columns, + dealii::ExcDimensionMismatch(this->n_columns, B.n_columns)); + Assert(this->n_rows == C.n_rows, + dealii::ExcDimensionMismatch(this->n_rows, C.n_rows)); + Assert(B.n_rows == C.n_columns, + dealii::ExcDimensionMismatch(B.n_rows, C.n_columns)); + Assert(this->row_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + C.row_block_size)); + Assert(this->column_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + B.column_block_size)); + Assert(B.row_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.row_block_size, + C.column_block_size)); + } + else // if (transpose_A && transpose_B) + { + Assert(this->n_rows == B.n_columns, + dealii::ExcDimensionMismatch(this->n_rows, B.n_columns)); + Assert(this->n_columns == C.n_rows, + dealii::ExcDimensionMismatch(this->n_columns, C.n_rows)); + Assert(B.n_rows == C.n_columns, + dealii::ExcDimensionMismatch(B.n_rows, C.n_columns)); + Assert(this->column_block_size == C.row_block_size, + dealii::ExcDimensionMismatch(this->row_block_size, + C.row_block_size)); + Assert(this->row_block_size == B.column_block_size, + dealii::ExcDimensionMismatch(this->column_block_size, + B.row_block_size)); + Assert(B.row_block_size == C.column_block_size, + dealii::ExcDimensionMismatch(B.column_block_size, + C.column_block_size)); + } + + if (this->grid->mpi_process_is_active) + { + char trans_a = + conjugate_transpose_A ? + (std::is_same>::value ? 'C' : + 'T') : + 'N'; + char trans_b = + conjugate_transpose_B ? + (std::is_same>::value ? 'C' : + 'T') : + 'N'; + + const NumberType *A_loc = + (this->values.size() > 0) ? this->values.data() : nullptr; + const NumberType *B_loc = + (B.values.size() > 0) ? B.values.data() : nullptr; + NumberType *C_loc = (C.values.size() > 0) ? C.values.data() : nullptr; + int m = C.n_rows; + int n = C.n_columns; + int k = conjugate_transpose_A ? this->n_rows : this->n_columns; + + pgemm(&trans_a, + &trans_b, + &m, + &n, + &k, + &b, + A_loc, + &(this->submatrix_row), + &(this->submatrix_column), + this->descriptor, + B_loc, + &B.submatrix_row, + &B.submatrix_column, + B.descriptor, + &c, + C_loc, + &C.submatrix_row, + &C.submatrix_column, + C.descriptor); + } + C.state = dftefeScalapack::LAPACKSupport::matrix; + } + + template + void + ScaLAPACKMatrix::mmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + mult(1., B, 1., C, false, false); + else + mult(1., B, 0, C, false, false); + } + + + + template + void + ScaLAPACKMatrix::Tmmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + mult(1., B, 1., C, true, false); + else + mult(1., B, 0, C, true, false); + } + + + + template + void + ScaLAPACKMatrix::mTmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + mult(1., B, 1., C, false, true); + else + mult(1., B, 0, C, false, true); + } + + + + template + void + ScaLAPACKMatrix::TmTmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + mult(1., B, 1., C, true, true); + else + mult(1., B, 0, C, true, true); + } + + + template + void + ScaLAPACKMatrix::zmmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + zmult(1., B, 1., C, false, false); + else + zmult(1., B, 0, C, false, false); + } + + + + template + void + ScaLAPACKMatrix::zCmmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + zmult(1., B, 1., C, true, false); + else + zmult(1., B, 0, C, true, false); + } + + + + template + void + ScaLAPACKMatrix::zmCmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + zmult(1., B, 1., C, false, true); + else + zmult(1., B, 0, C, false, true); + } + + + + template + void + ScaLAPACKMatrix::zCmCmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding) const + { + if (adding) + zmult(1., B, 1., C, true, true); + else + zmult(1., B, 0, C, true, true); + } + + template + void + ScaLAPACKMatrix::compute_cholesky_factorization() + { + Assert( + n_columns == n_rows && property == LAPACKSupport::Property::hermitian, + dealii::ExcMessage( + "Cholesky factorization can be applied to hermitian matrices only.")); + Assert(state == LAPACKSupport::matrix, + dealii::ExcMessage( + "Matrix has to be in Matrix state before calling this function.")); + + if (grid->mpi_process_is_active) + { + int info = 0; + NumberType *A_loc = this->values.data(); + // pdpotrf_(&uplo,&n_columns,A_loc,&submatrix_row,&submatrix_column,descriptor,&info); + ppotrf(&uplo, + &n_columns, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &info); + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("ppotrf", + info)); + } + state = dftefeScalapack::LAPACKSupport::cholesky; + property = (uplo == 'L' ? dftefeScalapack::LAPACKSupport::lower_triangular : + dftefeScalapack::LAPACKSupport::upper_triangular); + } + + template + void + ScaLAPACKMatrix::compute_lu_factorization() + { + Assert(state == LAPACKSupport::matrix, + dealii::ExcMessage( + "Matrix has to be in Matrix state before calling this function.")); + + if (grid->mpi_process_is_active) + { + int info = 0; + NumberType *A_loc = this->values.data(); + + const int iarow = indxg2p_(&submatrix_row, + &row_block_size, + &(grid->this_process_row), + &first_process_row, + &(grid->n_process_rows)); + const int mp = numroc_(&n_rows, + &row_block_size, + &(grid->this_process_row), + &iarow, + &(grid->n_process_rows)); + ipiv.resize(mp + row_block_size); + + pgetrf(&n_rows, + &n_columns, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + ipiv.data(), + &info); + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("pgetrf", + info)); + } + state = dftefeScalapack::LAPACKSupport::State::lu; + property = dftefeScalapack::LAPACKSupport::Property::general; + } + + template + void + ScaLAPACKMatrix::invert() + { + // Check whether matrix is hermitian and save flag. + // If a Cholesky factorization has been applied previously, + // the original matrix was hermitian. + const bool is_hermitian = + (property == dftefeScalapack::LAPACKSupport::hermitian || + state == dftefeScalapack::LAPACKSupport::State::cholesky); + + // Check whether matrix is triangular and is in an unfactorized state. + const bool is_triangular = + (property == dftefeScalapack::LAPACKSupport::upper_triangular || + property == dftefeScalapack::LAPACKSupport::lower_triangular) && + (state == dftefeScalapack::LAPACKSupport::State::matrix || + state == dftefeScalapack::LAPACKSupport::State::inverse_matrix); + + if (is_triangular) + { + if (grid->mpi_process_is_active) + { + const char uploTriangular = + property == dftefeScalapack::LAPACKSupport::upper_triangular ? + 'U' : + 'L'; + const char diag = 'N'; + int info = 0; + NumberType *A_loc = this->values.data(); + ptrtri(&uploTriangular, + &diag, + &n_columns, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &info); + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("ptrtri", + info)); + // The inversion is stored in the same part as the triangular + // matrix, so we don't need to re-set the property here. + } + } + else + { + // Matrix is neither in Cholesky nor LU state. + // Compute the required factorizations based on the property of the + // matrix. + if (!(state == dftefeScalapack::LAPACKSupport::State::lu || + state == dftefeScalapack::LAPACKSupport::State::cholesky)) + { + if (is_hermitian) + compute_cholesky_factorization(); + else + compute_lu_factorization(); + } + if (grid->mpi_process_is_active) + { + int info = 0; + NumberType *A_loc = this->values.data(); + + if (is_hermitian) + { + ppotri(&uplo, + &n_columns, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &info); + AssertThrow( + info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("ppotri", info)); + property = dftefeScalapack::LAPACKSupport::Property::hermitian; + } + else + { + int lwork = -1, liwork = -1; + work.resize(1); + iwork.resize(1); + + pgetri(&n_columns, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + ipiv.data(), + work.data(), + &lwork, + iwork.data(), + &liwork, + &info); + + AssertThrow( + info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("pgetri", info)); + lwork = lworkFromWork(work); + liwork = iwork[0]; + work.resize(lwork); + iwork.resize(liwork); + + pgetri(&n_columns, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + ipiv.data(), + work.data(), + &lwork, + iwork.data(), + &liwork, + &info); + + AssertThrow( + info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("pgetri", info)); + } + } + } + state = dftefeScalapack::LAPACKSupport::State::inverse_matrix; + } + + template + std::vector + ScaLAPACKMatrix::eigenpairs_hermitian_by_index( + const std::pair &index_limits, + const bool compute_eigenvectors) + { + // check validity of index limits + AssertIndexRange(index_limits.first, n_rows); + AssertIndexRange(index_limits.second, n_rows); + + std::pair idx = + std::make_pair(std::min(index_limits.first, index_limits.second), + std::max(index_limits.first, index_limits.second)); + + // compute all eigenvalues/eigenvectors + if (idx.first == 0 && idx.second == static_cast(n_rows - 1)) + return eigenpairs_hermitian(compute_eigenvectors); + else + return eigenpairs_hermitian(compute_eigenvectors, idx); + } + + + template + std::vector + ScaLAPACKMatrix::eigenpairs_hermitian( + const bool compute_eigenvectors, + const std::pair &eigenvalue_idx, + const std::pair & eigenvalue_limits) + { + Assert(state == dftefeScalapack::LAPACKSupport::matrix, + dealii::ExcMessage( + "Matrix has to be in Matrix state before calling this function.")); + Assert(property == dftefeScalapack::LAPACKSupport::hermitian, + dealii::ExcMessage( + "Matrix has to be hermitian for this operation.")); + + std::lock_guard lock(mutex); + + const bool use_values = (std::isnan(eigenvalue_limits.first) || + std::isnan(eigenvalue_limits.second)) ? + false : + true; + const bool use_indices = + ((eigenvalue_idx.first == dealii::numbers::invalid_unsigned_int) || + (eigenvalue_idx.second == dealii::numbers::invalid_unsigned_int)) ? + false : + true; + + Assert( + !(use_values && use_indices), + dealii::ExcMessage( + "Prescribing both the index and value range for the eigenvalues is ambiguous")); + + // if computation of eigenvectors is not required use a sufficiently small + // distributed matrix + std::unique_ptr> eigenvectors = + compute_eigenvectors ? + std::make_unique>(n_rows, + grid, + row_block_size) : + std::make_unique>( + grid->n_process_rows, grid->n_process_columns, grid, 1, 1); + + eigenvectors->property = property; + // number of eigenvalues to be returned from psyevx; upon successful exit ev + // contains the m seclected eigenvalues in ascending order set to all + // eigenvaleus in case we will be using psyev. + int m = n_rows; + std::vector ev(n_rows); + + if (grid->mpi_process_is_active) + { + int info = 0; + /* + * for jobz==N only eigenvalues are computed, for jobz='V' also the + * eigenvectors of the matrix are computed + */ + char jobz = compute_eigenvectors ? 'V' : 'N'; + char range = 'A'; + // default value is to compute all eigenvalues and optionally + // eigenvectors + bool all_eigenpairs = true; + double vl = 0.0; + double vu = 0.0; + int il = 1, iu = 1; + // number of eigenvectors to be returned; + // upon successful exit the first m=nz columns contain the selected + // eigenvectors (only if jobz=='V') + int nz = 0; + double abstol = 0.0; + + // orfac decides which eigenvectors should be reorthogonalized + // see + // http://www.netlib.org/scalapack/explore-html/df/d1a/pdsyevx_8f_source.html + // for explanation to keeps simple no reorthogonalized will be done by + // setting orfac to 0 + double orfac = 0; + // contains the indices of eigenvectors that failed to converge + std::vector ifail; + // This array contains indices of eigenvectors corresponding to + // a cluster of eigenvalues that could not be reorthogonalized + // due to insufficient workspace + // see + // http://www.netlib.org/scalapack/explore-html/df/d1a/pdsyevx_8f_source.html + // for explanation + std::vector iclustr; + // This array contains the gap between eigenvalues whose + // eigenvectors could not be reorthogonalized. + // see + // http://www.netlib.org/scalapack/explore-html/df/d1a/pdsyevx_8f_source.html + // for explanation + std::vector gap(n_local_rows * n_local_columns); + + // index range for eigenvalues is not specified + if (!use_indices) + { + // interval for eigenvalues is not specified and consequently all + // eigenvalues/eigenpairs will be computed + if (!use_values) + { + range = 'A'; + all_eigenpairs = true; + } + else + { + range = 'V'; + all_eigenpairs = false; + vl = + std::min(eigenvalue_limits.first, eigenvalue_limits.second); + vu = + std::max(eigenvalue_limits.first, eigenvalue_limits.second); + } + } + else + { + range = 'I'; + all_eigenpairs = false; + // as Fortran starts counting/indexing from 1 unlike C/C++, where it + // starts from 0 + il = std::min(eigenvalue_idx.first, eigenvalue_idx.second) + 1; + iu = std::max(eigenvalue_idx.first, eigenvalue_idx.second) + 1; + } + NumberType *A_loc = this->values.data(); + /* + * by setting lwork to -1 a workspace query for optimal length of work + * is performed + */ + int lwork = -1; + int liwork = -1; + NumberType *eigenvectors_loc = + (compute_eigenvectors ? eigenvectors->values.data() : nullptr); + work.resize(1); + iwork.resize(1); + + if (all_eigenpairs) + { + psyev(&jobz, + &uplo, + &n_rows, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + ev.data(), + eigenvectors_loc, + &eigenvectors->submatrix_row, + &eigenvectors->submatrix_column, + eigenvectors->descriptor, + work.data(), + &lwork, + &info); + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("psyev", + info)); + } + else + { + char cmach = compute_eigenvectors ? 'U' : 'S'; + plamch(&(this->grid->blacs_context), &cmach, abstol); + abstol *= 2; + ifail.resize(n_rows); + iclustr.resize(2 * grid->n_process_rows * grid->n_process_columns); + gap.resize(grid->n_process_rows * grid->n_process_columns); + + psyevx(&jobz, + &range, + &uplo, + &n_rows, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &vl, + &vu, + &il, + &iu, + &abstol, + &m, + &nz, + ev.data(), + &orfac, + eigenvectors_loc, + &eigenvectors->submatrix_row, + &eigenvectors->submatrix_column, + eigenvectors->descriptor, + work.data(), + &lwork, + iwork.data(), + &liwork, + ifail.data(), + iclustr.data(), + gap.data(), + &info); + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevx", + info)); + } + lwork = lworkFromWork(work); + work.resize(lwork); + + if (all_eigenpairs) + { + psyev(&jobz, + &uplo, + &n_rows, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + ev.data(), + eigenvectors_loc, + &eigenvectors->submatrix_row, + &eigenvectors->submatrix_column, + eigenvectors->descriptor, + work.data(), + &lwork, + &info); + + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("psyev", + info)); + } + else + { + liwork = iwork[0]; + AssertThrow(liwork > 0, dealii::ExcInternalError()); + iwork.resize(liwork); + + psyevx(&jobz, + &range, + &uplo, + &n_rows, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &vl, + &vu, + &il, + &iu, + &abstol, + &m, + &nz, + ev.data(), + &orfac, + eigenvectors_loc, + &eigenvectors->submatrix_row, + &eigenvectors->submatrix_column, + eigenvectors->descriptor, + work.data(), + &lwork, + iwork.data(), + &liwork, + ifail.data(), + iclustr.data(), + gap.data(), + &info); + + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevx", + info)); + } + // if eigenvectors are queried copy eigenvectors to original matrix + // as the temporary matrix eigenvectors has identical dimensions and + // block-cyclic distribution we simply swap the local array + if (compute_eigenvectors) + this->values.swap(eigenvectors->values); + + // adapt the size of ev to fit m upon return + while (ev.size() > static_cast(m)) + ev.pop_back(); + } + /* + * send number of computed eigenvalues to inactive processes + */ + grid->send_to_inactive(&m, 1); + + /* + * inactive processes have to resize array of eigenvalues + */ + if (!grid->mpi_process_is_active) + ev.resize(m); + /* + * send the eigenvalues to processors not being part of the process grid + */ + grid->send_to_inactive(ev.data(), ev.size()); + + /* + * if only eigenvalues are queried the content of the matrix will be + * destroyed if the eigenpairs are queried matrix A on exit stores the + * eigenvectors in the columns + */ + if (compute_eigenvectors) + { + property = dftefeScalapack::LAPACKSupport::Property::general; + state = dftefeScalapack::LAPACKSupport::eigenvalues; + } + else + state = dftefeScalapack::LAPACKSupport::unusable; + + return ev; + } + + + template + std::vector + ScaLAPACKMatrix::eigenpairs_hermitian_by_index_MRRR( + const std::pair &index_limits, + const bool compute_eigenvectors) + { + // Check validity of index limits. + AssertIndexRange(index_limits.first, static_cast(n_rows)); + AssertIndexRange(index_limits.second, static_cast(n_rows)); + + const std::pair idx = + std::make_pair(std::min(index_limits.first, index_limits.second), + std::max(index_limits.first, index_limits.second)); + + // Compute all eigenvalues/eigenvectors. + if (idx.first == 0 && idx.second == static_cast(n_rows - 1)) + return eigenpairs_hermitian_MRRR(compute_eigenvectors); + else + return eigenpairs_hermitian_MRRR(compute_eigenvectors, idx); + } + + + template + std::vector + ScaLAPACKMatrix::eigenpairs_hermitian_MRRR( + const bool compute_eigenvectors, + const std::pair &eigenvalue_idx, + const std::pair & eigenvalue_limits) + { + Assert(state == dftefeScalapack::LAPACKSupport::matrix, + dealii::ExcMessage( + "Matrix has to be in Matrix state before calling this function.")); + Assert(property == dftefeScalapack::LAPACKSupport::hermitian, + dealii::ExcMessage( + "Matrix has to be hermitian for this operation.")); + + std::lock_guard lock(mutex); + + const bool use_values = (std::isnan(eigenvalue_limits.first) || + std::isnan(eigenvalue_limits.second)) ? + false : + true; + const bool use_indices = + ((eigenvalue_idx.first == dealii::numbers::invalid_unsigned_int) || + (eigenvalue_idx.second == dealii::numbers::invalid_unsigned_int)) ? + false : + true; + + Assert( + !(use_values && use_indices), + dealii::ExcMessage( + "Prescribing both the index and value range for the eigenvalues is ambiguous")); + + // If computation of eigenvectors is not required, use a sufficiently small + // distributed matrix. + std::unique_ptr> eigenvectors = + compute_eigenvectors ? + std::make_unique>(n_rows, + grid, + row_block_size) : + std::make_unique>( + grid->n_process_rows, grid->n_process_columns, grid, 1, 1); + + eigenvectors->property = property; + // Number of eigenvalues to be returned from psyevr; upon successful exit ev + // contains the m seclected eigenvalues in ascending order. + int m = n_rows; + std::vector ev(n_rows); + + // Number of eigenvectors to be returned; + // Upon successful exit the first m=nz columns contain the selected + // eigenvectors (only if jobz=='V'). + int nz = 0; + + if (grid->mpi_process_is_active) + { + int info = 0; + /* + * For jobz==N only eigenvalues are computed, for jobz='V' also the + * eigenvectors of the matrix are computed. + */ + char jobz = compute_eigenvectors ? 'V' : 'N'; + // Default value is to compute all eigenvalues and optionally + // eigenvectors. + char range = 'A'; + double vl = 0.0; + double vu = 0.0; + int il = 1, iu = 1; + + // Index range for eigenvalues is not specified. + if (!use_indices) + { + // Interval for eigenvalues is not specified and consequently all + // eigenvalues/eigenpairs will be computed. + if (!use_values) + { + range = 'A'; + } + else + { + range = 'V'; + vl = + std::min(eigenvalue_limits.first, eigenvalue_limits.second); + vu = + std::max(eigenvalue_limits.first, eigenvalue_limits.second); + } + } + else + { + range = 'I'; + // As Fortran starts counting/indexing from 1 unlike C/C++, where it + // starts from 0. + il = std::min(eigenvalue_idx.first, eigenvalue_idx.second) + 1; + iu = std::max(eigenvalue_idx.first, eigenvalue_idx.second) + 1; + } + NumberType *A_loc = this->values.data(); + + /* + * By setting lwork to -1 a workspace query for optimal length of work + * is performed. + */ + int lwork = -1; + int liwork = -1; + NumberType *eigenvectors_loc = + (compute_eigenvectors ? eigenvectors->values.data() : nullptr); + work.resize(1); + iwork.resize(1); + + psyevr(&jobz, + &range, + &uplo, + &n_rows, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &vl, + &vu, + &il, + &iu, + &m, + &nz, + ev.data(), + eigenvectors_loc, + &eigenvectors->submatrix_row, + &eigenvectors->submatrix_column, + eigenvectors->descriptor, + work.data(), + &lwork, + iwork.data(), + &liwork, + &info); + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevr", + info)); + + lwork = lworkFromWork(work); + work.resize(lwork); + liwork = iwork[0]; + iwork.resize(liwork); + + psyevr(&jobz, + &range, + &uplo, + &n_rows, + A_loc, + &submatrix_row, + &submatrix_column, + descriptor, + &vl, + &vu, + &il, + &iu, + &m, + &nz, + ev.data(), + eigenvectors_loc, + &eigenvectors->submatrix_row, + &eigenvectors->submatrix_column, + eigenvectors->descriptor, + work.data(), + &lwork, + iwork.data(), + &liwork, + &info); + + AssertThrow(info == 0, + dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevr", + info)); + + if (compute_eigenvectors) + AssertThrow( + m == nz, + dealii::ExcMessage( + "psyevr failed to compute all eigenvectors for the selected eigenvalues")); + + // If eigenvectors are queried, copy eigenvectors to original matrix. + // As the temporary matrix eigenvectors has identical dimensions and + // block-cyclic distribution we simply swap the local array. + if (compute_eigenvectors) + this->values.swap(eigenvectors->values); + + // Adapt the size of ev to fit m upon return. + while (ev.size() > static_cast(m)) + ev.pop_back(); + } + /* + * Send number of computed eigenvalues to inactive processes. + */ + grid->send_to_inactive(&m, 1); + + /* + * Inactive processes have to resize array of eigenvalues. + */ + if (!grid->mpi_process_is_active) + ev.resize(m); + /* + * Send the eigenvalues to processors not being part of the process grid. + */ + grid->send_to_inactive(ev.data(), ev.size()); + + /* + * If only eigenvalues are queried, the content of the matrix will be + * destroyed. If the eigenpairs are queried, matrix A on exit stores the + * eigenvectors in the columns. + */ + if (compute_eigenvectors) + { + property = dftefeScalapack::LAPACKSupport::Property::general; + state = dftefeScalapack::LAPACKSupport::eigenvalues; + } + else + state = dftefeScalapack::LAPACKSupport::unusable; + + return ev; + } + + + template class ScaLAPACKMatrix; + template class ScaLAPACKMatrix>; +} // namespace dftefeScalapack diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.h new file mode 100644 index 000000000..4f97d0e7d --- /dev/null +++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.h @@ -0,0 +1,802 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2019-2020 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + + + +#ifndef ScaLAPACKMatrix_H_ +#define ScaLAPACKMatrix_H_ + +#include "process_grid.h" +#include "lapack_support.h" +#include +#include +#include + + +namespace dftefeScalapack +{ + /** + * @brief Scalapack wrapper adapted from dealii library and extended implementation to complex datatype + * + * @author Sambit Das + */ + template + class ScaLAPACKMatrix + { + public: + /** + * Declare the type for container size. + */ + using size_type = unsigned int; + + /** + * Constructor for a rectangular matrix with @p n_rows and @p n_cols + * and distributed using the grid @p process_grid. + * + * The parameters @p row_block_size and @p column_block_size are the block sizes used + * for the block-cyclic distribution of the matrix. + * In general, it is recommended to use powers of $2$, e.g. $16,32,64, + * \dots$. + */ + ScaLAPACKMatrix( + const size_type n_rows, + const size_type n_columns, + const std::shared_ptr &process_grid, + const size_type row_block_size = 32, + const size_type column_block_size = 32, + const dftefeScalapack::LAPACKSupport::Property property = + dftefeScalapack::LAPACKSupport::Property::general); + + /** + * Constructor for a square matrix of size @p size, and distributed + * using the process grid in @p process_grid. + * + * The parameter @p block_size is used for the block-cyclic distribution of the matrix. + * An identical block size is used for the rows and columns of the matrix. + * In general, it is recommended to use powers of $2$, e.g. $16,32,64, + * \dots$. + */ + ScaLAPACKMatrix( + const size_type size, + const std::shared_ptr &process_grid, + const size_type block_size = 32, + const dftefeScalapack::LAPACKSupport::Property property = + dftefeScalapack::LAPACKSupport::Property::hermitian); + + + /** + * Initialize the rectangular matrix with @p n_rows and @p n_cols + * and distributed using the grid @p process_grid. + * + * The parameters @p row_block_size and @p column_block_size are the block sizes used + * for the block-cyclic distribution of the matrix. + * In general, it is recommended to use powers of $2$, e.g. $16,32,64, + * \dots$. + */ + void + reinit( + const size_type n_rows, + const size_type n_columns, + const std::shared_ptr &process_grid, + const size_type row_block_size = 32, + const size_type column_block_size = 32, + const dftefeScalapack::LAPACKSupport::Property property = + dftefeScalapack::LAPACKSupport::Property::general); + + /** + * Initialize the square matrix of size @p size and distributed using the grid @p process_grid. + * + * The parameter @p block_size is used for the block-cyclic distribution of the matrix. + * An identical block size is used for the rows and columns of the matrix. + * In general, it is recommended to use powers of $2$, e.g. $16,32,64, + * \dots$. + */ + void + reinit( + const size_type size, + const std::shared_ptr &process_grid, + const size_type block_size = 32, + const dftefeScalapack::LAPACKSupport::Property property = + dftefeScalapack::LAPACKSupport::Property::hermitian); + + + /** + * Assign @p property to this matrix. + */ + void + set_property(const dftefeScalapack::LAPACKSupport::Property property); + + /** + * Return current @p property of this matrix + */ + dftefeScalapack::LAPACKSupport::Property + get_property() const; + + /** + * Return current @p state of this matrix + */ + dftefeScalapack::LAPACKSupport::State + get_state() const; + + + /** + * Copy the contents of the distributed matrix into a differently distributed matrix @p dest. + * The function also works for matrices with different process grids + * or block-cyclic distributions. + */ + void + copy_to(ScaLAPACKMatrix &dest) const; + + + /** + * Complex conjugate. + */ + void + conjugate(); + + + /** + * The operations based on the input parameter @p transpose_B and the + * alignment conditions are summarized in the following table: + * + * | transpose_B | Block Sizes | Operation | + * | :---------: | :--------------------------: | :-------------------------------------------: | + * | false | $MB_A=MB_B$
$NB_A=NB_B$ | $\mathbf{A} = a \mathbf{A} + b \mathbf{B}$ | + * | true | $MB_A=NB_B$
$NB_A=MB_B$ | $\mathbf{A} = a \mathbf{A} + b \mathbf{B}^T$ | + * + * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process + * grid. + */ + void + add(const ScaLAPACKMatrix &B, + const NumberType a = 0., + const NumberType b = 1., + const bool transpose_B = false); + + + /** + * The operations based on the input parameter @p conjugate_transpose_B and the + * alignment conditions are summarized in the following table: + * + * | transpose_B | Block Sizes | Operation | + * | :---------: | :--------------------------: | :-------------------------------------------: | + * | false | $MB_A=MB_B$
$NB_A=NB_B$ | $\mathbf{A} = a \mathbf{A} + b \mathbf{B}$ | + * | true | $MB_A=NB_B$
$NB_A=MB_B$ | $\mathbf{A} = a \mathbf{A} + b \mathbf{B}^C$ | + * + * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process + * grid. + */ + void + zadd(const ScaLAPACKMatrix &B, + const NumberType a = 0., + const NumberType b = 1., + const bool conjugate_transpose_B = false); + + /** + * Transposing assignment: $\mathbf{A} = \mathbf{B}^T$ + * + * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process + * grid. + * + * The following alignment conditions have to be fulfilled: $MB_A=NB_B$ and + * $NB_A=MB_B$. + */ + void + copy_transposed(const ScaLAPACKMatrix &B); + + + /** + * Transposing assignment: $\mathbf{A} = \mathbf{B}^C$ + * + * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process + * grid. + * + * The following alignment conditions have to be fulfilled: $MB_A=NB_B$ and + * $NB_A=MB_B$. + */ + void + copy_conjugate_transposed(const ScaLAPACKMatrix &B); + + /** + * Matrix-matrix-multiplication: + * + * The operations based on the input parameters and the alignment conditions + * are summarized in the following table: + * + * | transpose_A | transpose_B | Block Sizes | Operation | + * | :---------: | :---------: | :-------------------------------------------: | :-------------------------------------------------------------: | + * | false | false | $MB_A=MB_C$
$NB_A=MB_B$
$NB_B=NB_C$ | $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B} + c \mathbf{C}$ | + * | false | true | $MB_A=MB_C$
$NB_A=NB_B$
$MB_B=NB_C$ | $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B}^T + c \mathbf{C}$ | + * | true | false | $MB_A=MB_B$
$NB_A=MB_C$
$NB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^T \cdot \mathbf{B} + c \mathbf{C}$ | + * | true | true | $MB_A=NB_B$
$NB_A=MB_C$
$MB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^T \cdot \mathbf{B}^T + c \mathbf{C}$ | + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The matrices $\mathbf{A}$, $\mathbf{B}$ and $\mathbf{C}$ must have the + * same process grid. + */ + void + mult(const NumberType b, + const ScaLAPACKMatrix &B, + const NumberType c, + ScaLAPACKMatrix & C, + const bool transpose_A = false, + const bool transpose_B = false) const; + + + /** + * Matrix-matrix-multiplication: + * + * The operations based on the input parameters and the alignment conditions + * are summarized in the following table: + * + * | conjugate_transpose_A | conjugate_transpose_B | Block Sizes | Operation | + * | :---------: | :---------: | :-------------------------------------------: | :-------------------------------------------------------------: | + * | false | false | $MB_A=MB_C$
$NB_A=MB_B$
$NB_B=NB_C$ | $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B} + c \mathbf{C}$ | + * | false | true | $MB_A=MB_C$
$NB_A=NB_B$
$MB_B=NB_C$ | $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B}^C + c \mathbf{C}$ | + * | true | false | $MB_A=MB_B$
$NB_A=MB_C$
$NB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^C \cdot \mathbf{B} + c \mathbf{C}$ | + * | true | true | $MB_A=NB_B$
$NB_A=MB_C$
$MB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^C \cdot \mathbf{B}^C + c \mathbf{C}$ | + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The matrices $\mathbf{A}$, $\mathbf{B}$ and $\mathbf{C}$ must have the + * same process grid. + */ + void + zmult(const NumberType b, + const ScaLAPACKMatrix &B, + const NumberType c, + ScaLAPACKMatrix & C, + const bool conjugate_transpose_A = false, + const bool conjugate_transpose_B = false) const; + + + /** + * Matrix-matrix-multiplication. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}$ + * + * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=MB_C$, + * $NB_A=MB_B$ and $NB_B=NB_C$. + */ + void + mmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + /** + * Matrix-matrix-multiplication using transpose of $\mathbf{A}$. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^T \cdot \mathbf{B}$ + * + * else $\mathbf{C} = \mathbf{A}^T \cdot \mathbf{B}$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=MB_B$, + * $NB_A=MB_C$ and $NB_B=NB_C$. + */ + void + Tmmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + /** + * Matrix-matrix-multiplication using the transpose of $\mathbf{B}$. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}^T$ + * + * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}^T$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=MB_C$, + * $NB_A=NB_B$ and $MB_B=NB_C$. + */ + void + mTmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + /** + * Matrix-matrix-multiplication using transpose of $\mathbf{A}$ and + * $\mathbf{B}$. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^T \cdot + * \mathbf{B}^T$ + * + * else $\mathbf{C} = \mathbf{A}^T \cdot \mathbf{B}^T$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=NB_B$, + * $NB_A=MB_C$ and $MB_B=NB_C$. + */ + void + TmTmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + + /** + * Matrix-matrix-multiplication. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}$ + * + * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=MB_C$, + * $NB_A=MB_B$ and $NB_B=NB_C$. + */ + void + zmmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + /** + * Matrix-matrix-multiplication using conjugate transpose of $\mathbf{A}$. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^C \cdot \mathbf{B}$ + * + * else $\mathbf{C} = \mathbf{A}^C \cdot \mathbf{B}$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=MB_B$, + * $NB_A=MB_C$ and $NB_B=NB_C$. + */ + void + zCmmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + /** + * Matrix-matrix-multiplication using the conjugate transpose of + * $\mathbf{B}$. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}^C$ + * + * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}^C$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=MB_C$, + * $NB_A=NB_B$ and $MB_B=NB_C$. + */ + void + zmCmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + /** + * Matrix-matrix-multiplication using conjugate transpose of $\mathbf{A}$ + * and + * $\mathbf{B}$. + * + * The optional parameter @p adding determines whether the result is + * stored in $\mathbf{C}$ or added to $\mathbf{C}$. + * + * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^C \cdot + * \mathbf{B}^T$ + * + * else $\mathbf{C} = \mathbf{A}^C \cdot \mathbf{B}^C$ + * + * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes + * and that + * $\mathbf{C}$ already has the right size. + * + * The following alignment conditions have to be fulfilled: $MB_A=NB_B$, + * $NB_A=MB_C$ and $MB_B=NB_C$. + */ + void + zCmCmult(ScaLAPACKMatrix & C, + const ScaLAPACKMatrix &B, + const bool adding = false) const; + + + /** + * Number of rows of the $M \times N$ matrix. + */ + size_type + m() const; + + /** + * Number of columns of the $M \times N$ matrix. + */ + size_type + n() const; + + /** + * Number of local rows on this MPI processes. + */ + unsigned int + local_m() const; + + /** + * Number of local columns on this MPI process. + */ + unsigned int + local_n() const; + + /** + * Return the global row number for the given local row @p loc_row . + */ + unsigned int + global_row(const unsigned int loc_row) const; + + /** + * Return the global column number for the given local column @p loc_column. + */ + unsigned int + global_column(const unsigned int loc_column) const; + + /** + * Read access to local element. + */ + NumberType + local_el(const unsigned int loc_row, const unsigned int loc_column) const; + + /** + * Write access to local element. + */ + NumberType & + local_el(const unsigned int loc_row, const unsigned int loc_column); + + /** + * Compute the Cholesky factorization of the matrix using ScaLAPACK + * function pXpotrf. The result of the factorization is stored + * in this object. + */ + void + compute_cholesky_factorization(); + + /** + * Compute the LU factorization of the matrix using ScaLAPACK + * function pXgetrf and partial pivoting with row interchanges. + * The result of the factorization is stored in this object. + */ + void + compute_lu_factorization(); + + /** + * Invert the matrix by first computing a Cholesky for hermitian matrices + * or a LU factorization for general matrices and then + * building the actual inverse using pXpotri or + * pXgetri. If the matrix is triangular, the LU factorization + * step is skipped, and pXtrtri is used directly. + * + * If a Cholesky or LU factorization has been applied previously, + * pXpotri or pXgetri are called directly. + * + * The inverse is stored in this object. + */ + void + invert(); + + /** + * Computing selected eigenvalues and, optionally, the eigenvectors of the + * real hermitian matrix $\mathbf{A} \in \mathbb{R}^{M \times M}$. + * + * The eigenvalues/eigenvectors are selected by prescribing a range of indices @p index_limits. + * + * If successful, the computed eigenvalues are arranged in ascending order. + * The eigenvectors are stored in the columns of the matrix, thereby + * overwriting the original content of the matrix. + * + * If all eigenvalues/eigenvectors have to be computed, pass the closed interval $ \left[ 0, M-1 \right] $ in @p index_limits. + * + * Pass the closed interval $ \left[ M-r, M-1 \right] $ if the $r$ largest + * eigenvalues/eigenvectors are desired. + */ + std::vector + eigenpairs_hermitian_by_index( + const std::pair &index_limits, + const bool compute_eigenvectors); + + /** + * Computing selected eigenvalues and, optionally, the eigenvectors of the + * real hermitian matrix $\mathbf{A} \in \mathbb{R}^{M \times M}$ using the + * MRRR algorithm. + * + * The eigenvalues/eigenvectors are selected by prescribing a range of indices @p index_limits. + * + * If successful, the computed eigenvalues are arranged in ascending order. + * The eigenvectors are stored in the columns of the matrix, thereby + * overwriting the original content of the matrix. + * + * If all eigenvalues/eigenvectors have to be computed, pass the closed interval $ \left[ 0, M-1 \right] $ in @p index_limits. + * + * Pass the closed interval $ \left[ M-r, M-1 \right] $ if the $r$ largest + * eigenvalues/eigenvectors are desired. + */ + std::vector + eigenpairs_hermitian_by_index_MRRR( + const std::pair &index_limits, + const bool compute_eigenvectors); + + private: + /** + * Computing selected eigenvalues and, optionally, the eigenvectors. + * The eigenvalues/eigenvectors are selected by either prescribing a range of indices @p index_limits + * or a range of values @p value_limits for the eigenvalues. The function will throw an exception + * if both ranges are prescribed (meaning that both ranges differ from the + * default value) as this ambiguity is prohibited. If successful, the + * computed eigenvalues are arranged in ascending order. The eigenvectors + * are stored in the columns of the matrix, thereby overwriting the original + * content of the matrix. + */ + std::vector + eigenpairs_hermitian( + const bool compute_eigenvectors, + const std::pair &index_limits = + std::make_pair(dealii::numbers::invalid_unsigned_int, + dealii::numbers::invalid_unsigned_int), + const std::pair &value_limits = + std::make_pair(std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN())); + + /** + * Computing selected eigenvalues and, optionally, the eigenvectors of the + * real hermitian matrix $\mathbf{A} \in \mathbb{R}^{M \times M}$ using the + * MRRR algorithm. + * The eigenvalues/eigenvectors are selected by either prescribing a range of indices @p index_limits + * or a range of values @p value_limits for the eigenvalues. The function will throw an exception + * if both ranges are prescribed (meaning that both ranges differ from the + * default value) as this ambiguity is prohibited. + * + * By calling this function the original content of the matrix will be + * overwritten. If requested, the eigenvectors are stored in the columns of + * the matrix. Also in the case that just the eigenvalues are required, the + * content of the matrix will be overwritten. + * + * If successful, the computed eigenvalues are arranged in ascending order. + * + * @note Due to a bug in Netlib-ScaLAPACK, either all or no eigenvectors can be computed. + * Therefore, the input @p index_limits has to be set accordingly. Using Intel-MKL this restriction is not required. + */ + std::vector + eigenpairs_hermitian_MRRR( + const bool compute_eigenvectors, + const std::pair &index_limits = + std::make_pair(dealii::numbers::invalid_unsigned_int, + dealii::numbers::invalid_unsigned_int), + const std::pair &value_limits = + std::make_pair(std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN())); + + + /** + * local storage + */ + std::vector values; + + /** + * Since ScaLAPACK operations notoriously change the meaning of the matrix + * entries, we record the current state after the last operation here. + */ + dftefeScalapack::LAPACKSupport::State state; + + /** + * Additional property of the matrix which may help to select more + * efficient ScaLAPACK functions. + */ + dftefeScalapack::LAPACKSupport::Property property; + + /** + * A shared pointer to a Utilities::MPI::ProcessGrid object which contains a + * BLACS context and a MPI communicator, as well as other necessary data + * structures. + */ + std::shared_ptr grid; + + /** + * Number of rows in the matrix. + */ + int n_rows; + + /** + * Number of columns in the matrix. + */ + int n_columns; + + /** + * Row block size. + */ + int row_block_size; + + /** + * Column block size. + */ + int column_block_size; + + /** + * Number of rows in the matrix owned by the current process. + */ + int n_local_rows; + + /** + * Number of columns in the matrix owned by the current process. + */ + int n_local_columns; + + /** + * ScaLAPACK description vector. + */ + int descriptor[9]; + + /** + * Workspace array. + */ + mutable std::vector work; + + /** + * Integer workspace array. + */ + mutable std::vector iwork; + + /** + * Integer array holding pivoting information required + * by ScaLAPACK's matrix factorization routines. + */ + std::vector ipiv; + + /** + * A character to define where elements are stored in case + * ScaLAPACK operations support this. + */ + const char uplo; + + /** + * The process row of the process grid over which the first row + * of the global matrix is distributed. + */ + const int first_process_row; + + /** + * The process column of the process grid over which the first column + * of the global matrix is distributed. + */ + const int first_process_column; + + /** + * Global row index that determines where to start a submatrix. + * Currently this equals unity, as we don't use submatrices. + */ + const int submatrix_row; + + /** + * Global column index that determines where to start a submatrix. + * Currently this equals unity, as we don't use submatrices. + */ + const int submatrix_column; + + /** + * Thread mutex. + */ + mutable dealii::Threads::Mutex mutex; + }; + + // ----------------------- inline functions ---------------------------- + +#ifndef DOXYGEN + + template + inline NumberType + ScaLAPACKMatrix::local_el(const unsigned int loc_row, + const unsigned int loc_column) const + { + return values[loc_column * n_local_rows + loc_row]; + // return (*this)(loc_row, loc_column); + } + + + + template + inline NumberType & + ScaLAPACKMatrix::local_el(const unsigned int loc_row, + const unsigned int loc_column) + { + return values[loc_column * n_local_rows + loc_row]; + // return (*this)(loc_row, loc_column); + } + + + template + inline unsigned int + ScaLAPACKMatrix::m() const + { + return n_rows; + } + + + + template + inline unsigned int + ScaLAPACKMatrix::n() const + { + return n_columns; + } + + + + template + unsigned int + ScaLAPACKMatrix::local_m() const + { + return n_local_rows; + } + + + + template + unsigned int + ScaLAPACKMatrix::local_n() const + { + return n_local_columns; + } + + +#endif // DOXYGEN + + +} // namespace dftefeScalapack +#endif // ScaLAPACKMatrix_H_ diff --git a/src/linearAlgebra/GeneralMatrix.h b/src/linearAlgebra/GeneralMatrix.h index a632560de..aefc1d2ea 100644 --- a/src/linearAlgebra/GeneralMatrix.h +++ b/src/linearAlgebra/GeneralMatrix.h @@ -44,6 +44,90 @@ namespace dftefe size_t nb = global_nb, size_t mb = global_mb); + /** + * @brief The value setter interface for the matrix class. This setter + * assumes the given pointer to ValueType to be a serially repeated + * matrix and copy the data to the corresponding local owned + * location trivially. + * @warning There is no boundary check in this function. The user should + * be responsible that the size of data array should have d_m*d_n + * size. + * @param data The pointer to the data to be copied from. + */ + virtual void + setValues(const ValueType *data) override; + + /** + * @brief The value setter interface for the matrix class. This setter + * assumes the given pointer to ValueType to be a serially repeated + * matrix and copy the data to the corresponding local owned + * sub-matrix (i1:i2-1, j1:j2-1) trivially. + * @warning There is no boundary check in this function. The user should + * be responsible that the size of data array should have + * (j2-j1)*(i2-i1) size. + * @param i1 the global index of the first row of the submatrix. + * @param i2 one passes the global index of the last row of the submatrix. + * @param j1 the global index of the first column of the submatrix. + * @param j2 one passes the global index of the last column of the + * submatrix. + * @param data The pointer to the data to be copied from. The user should + * be responsible that the size of data array should have + * (j2-j1)*(i2-i1) size. + */ + virtual void + setValues(size_t i1, + size_t i2, + size_t j1, + size_t j2, + const ValueType *data) override; + + // /** + // * @brief The value setter interface for the matrix class. This setter + // * allows data to be different on each processor. The + // locally owned + // * data contains two parts: (1) values belongs to the + // locally owned + // * portion of the matrix and (2) values belong to the + // off-processor + // * portion of the matrix. This setter will distribute the + // * off-processor values correspondingly. + // * @warning This routine currently is not optimized and fully + // * tested. Using this routing to assign values can + // * deteriorate the performance dramatically. This should + // * only be used for debugging purpose or not performance + // * relevant part of the code. + // * @param i1 the global index of the first row of the submatrix. + // * @param i2 one passes the global index of the last row of the submatrix. + // * @param j1 the global index of the first column of the submatrix. + // * @param j2 one passes the global index of the last column of the + // * submatrix. + // * @param data The pointer to the data to be copied from. The user should + // * be responsible that the size of data array should + // have + // * (j2-j1)*(i2-i1) size. + // */ + // virtual void + // setDistributedValues(size_t i1, + // size_t i2, + // size_t j1, + // size_t j2, + // const ValueType *data) = 0; + // + // /** + // * @brief The value setter interface for inserting a single value. + // * @warning This routine is sub-optimal and can seriously deteriorate the + // * the performance. It should be only used when + // necessary. Only + // * the processor which owns (i, j) element will be + // inserting + // * value. + // * @param i the row index of the value. + // * @param j the column index of the value. + // * @param d the value. + // */ + // virtual void + // setValue(size_t i, size_t j, ValueType d) = 0; + slate::Matrix & getSlateMatrix() const; diff --git a/src/linearAlgebra/GeneralMatrix.t.cpp b/src/linearAlgebra/GeneralMatrix.t.cpp index c94ebe68b..731340774 100644 --- a/src/linearAlgebra/GeneralMatrix.t.cpp +++ b/src/linearAlgebra/GeneralMatrix.t.cpp @@ -49,12 +49,35 @@ namespace dftefe { d_matrix->insertLocalTiles(slate::Target::Host); } + AbstractMatrix::initSlateMatrix(d_matrix); } + + template slate::Matrix & GeneralMatrix::getSlateMatrix() const { - return d_matrix; + return *d_matrix; + } + + template + void + GeneralMatrix::setValues(const ValueType *data) + { + AbstractMatrix::setValueSlateMatrix(d_baseMatrix, + data); + } + + template + void + GeneralMatrix::setValues(size_t i1, + size_t i2, + size_t j1, + size_t j2, + const ValueType *data) + { + slate::Matrix mat = d_matrix->sub(i1, i2, j1, j2); + AbstractMatrix::setValueSlateMatrix(&mat, data); } } // namespace linearAlgebra } // namespace dftefe diff --git a/src/linearAlgebra/HermitianMatrix.h b/src/linearAlgebra/HermitianMatrix.h index d04253a57..6b3522674 100644 --- a/src/linearAlgebra/HermitianMatrix.h +++ b/src/linearAlgebra/HermitianMatrix.h @@ -43,6 +43,43 @@ namespace dftefe size_t q, size_t nb = global_nb); + /** + * @brief The value setter interface for the matrix class. This setter + * assumes the given pointer to ValueType to be a serially repeated + * matrix and copy the data to the corresponding local owned + * location trivially. + * @warning There is no boundary check in this function. The user should + * be responsible that the size of data array should have d_m*d_n + * size. + * @param data The pointer to the data to be copied from. + */ + virtual void + setValues(const ValueType *data) override; + + /** + * @brief The value setter interface for the matrix class. This setter + * assumes the given pointer to ValueType to be a serially repeated + * matrix and copy the data to the corresponding local owned + * sub-matrix (i1:i2-1, j1:j2-1) trivially. + * @warning There is no boundary check in this function. The user should + * be responsible that the size of data array should have + * (j2-j1)*(i2-i1) size. + * @param i1 the global index of the first row of the submatrix. + * @param i2 one passes the global index of the last row of the submatrix. + * @param j1 the global index of the first column of the submatrix. + * @param j2 one passes the global index of the last column of the + * submatrix. + * @param data The pointer to the data to be copied from. The user should + * be responsible that the size of data array should have + * (j2-j1)*(i2-i1) size. + */ + virtual void + setValues(size_t i1, + size_t i2, + size_t j1, + size_t j2, + const ValueType *data) override; + slate::HermitianMatrix & getSlateMatrix() const; diff --git a/src/linearAlgebra/HermitianMatrix.t.cpp b/src/linearAlgebra/HermitianMatrix.t.cpp index 87549d9b4..0e66592f3 100644 --- a/src/linearAlgebra/HermitianMatrix.t.cpp +++ b/src/linearAlgebra/HermitianMatrix.t.cpp @@ -48,12 +48,30 @@ namespace dftefe { d_matrix->insertLocalTiles(slate::Target::Host); } + AbstractMatrix::initSlateMatrix(d_matrix); } template slate::HermitianMatrix & HermitianMatrix::getSlateMatrix() const { - return d_matrix; + return *d_matrix; } + + template + void + HermitianMatrix::setValues(const ValueType *data) + { + AbstractMatrix::setValueSlateMatrix(d_baseMatrix, + data); + } + + template + void + HermitianMatrix::setValues(size_t i1, + size_t i2, + size_t j1, + size_t j2, + const ValueType *data) + {} } // namespace linearAlgebra } // namespace dftefe diff --git a/src/linearAlgebra/DistributedDenseMatrix.cpp b/src/linearAlgebra/MatVecOperations.cpp similarity index 95% rename from src/linearAlgebra/DistributedDenseMatrix.cpp rename to src/linearAlgebra/MatVecOperations.cpp index fcc3ff860..b18012fa0 100644 --- a/src/linearAlgebra/DistributedDenseMatrix.cpp +++ b/src/linearAlgebra/MatVecOperations.cpp @@ -20,10 +20,7 @@ ******************************************************************************/ /* - * @author Ian C. Lin. + * @author Sambit Das. */ -#include "DistributedDenseMatrix.h" - -namespace dftefe -{} +#include "MatVecOperations.h" diff --git a/src/linearAlgebra/MatVecOperations.h b/src/linearAlgebra/MatVecOperations.h new file mode 100644 index 000000000..d575f7746 --- /dev/null +++ b/src/linearAlgebra/MatVecOperations.h @@ -0,0 +1,136 @@ +/****************************************************************************** + * Copyright (c) 2021. * + * The Regents of the University of Michigan and DFT-EFE developers. * + * * + * This file is part of the DFT-EFE code. * + * * + * DFT-EFE is free software: you can redistribute it and/or modify * + * it under the terms of the Lesser GNU General Public License as * + * published by the Free Software Foundation, either version 3 of * + * the License, or (at your option) any later version. * + * * + * DFT-EFE is distributed in the hope that it will be useful, but * + * WITHOUT ANY WARRANTY; without even the implied warranty * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the Lesser GNU General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License at the top level of DFT-EFE distribution. If not, see * + * . * + ******************************************************************************/ + +/* + * @author Sambit Das + */ + +#ifndef dftefeMatVecOperations_h +#define dftefeMatVecOperations_h + +#include +#include +#include +#include +#include +#include +#include + +namespace dftefe +{ + namespace linearAlgebra + { + namespace MatVecOperations + { + /** + * @brief Compute \f$ {\bf S}= {\bf A}^{\dagger} {\bf A} \f$ where + * \f$ {\bf A} \f$ is a dense matrix of size M times N with distributed + * memory parallelization over the rows. \f$ {\bf A} \f$ is stored + * as a MultiVector with number of vectors to be N and row major + * storage. \f$ {\bf S} \f$ is the N times N Hermitian overlap matrix + * stored in block-cyclic format. The parallelization of \f$ {\bf S} \f$ + * is independent of that of \f$ {\bf A} \f$ + * @param[in] A the dense matrix stored as a MultiVector + * @param[in,out] S preallocated Hermitian overlap matrix + * @param[in] context LinAlg context + * @param[in] NBlockSize determines the block size for + * blocked loop over the N vectors during the computation + * of \f$ {\bf S} \f$, for default value of 0 it is heuristically + * determined. This aspect to required for reducing the local MPI task + * peak memory scaling to \f$ \approx \f$ N times vectorsBlockSize + * instead of N times N + */ + template + void + computeAMultiVecConjTransTimesAMultiVec( + const MultiVector &A, + HermitianMatrix & S, + LinAlgOpContext & context, + const size_type NBlockSize = 0); + + /** + * @brief In-place computation of \f$ {\bf A}= {\bf A} {\bf T} \f$ where + * \f$ {\bf A} \f$ is a dense matrix of size M times N with distributed + * memory parallelization over the rows. \f$ {\bf A} \f$ is stored + * as a MultiVector with number of vectors to be N and row major + * storage. \f$ {\bf T} \f$ is either an upper or lower triangular + * N times N matrix stored in block cyclic format. The parallelization + * of \f$ {\bf T} \f$ is independent of that of \f$ {\bf A} \f$ + * @param[in,out] A the dense matrix stored as a MultiVector + * @param[in] T preallocated triangular matrix + * @param[in] context LinAlg context + * @param[in] MBlockSize determines the block size for + * blocked loop over the M vector size during the in-place computation. + * Default value of 0 sets block size heuristically. + * @param[in] NBlockSize determines the block size for + * blocked loop over the N vectors. Default value of 0 sets + * block size heuristically. + */ + template + void + computeAMultiVecTimesTriangularMat( + MultiVector & A, + const TriangularMatrix &T, + LinAlgOpContext & context, + const size_type MBlockSize = 0, + const size_type NBlockSize = 0); + + + /** + * @brief In-place computation of \f$ {\bf A}= {\bf A} {\bf G} \f$ where + * \f$ {\bf A} \f$ is a dense matrix of size M times N with distributed + * memory parallelization over the rows. \f$ {\bf A} \f$ is stored + * as a MultiVector with number of vectors to be N and row major + * storage. \f$ {\bf G} \f$ is a N times N general matrix + * stored in block cyclic format. The parallelization + * of \f$ {\bf G} \f$ is independent of that of \f$ {\bf A} \f$ + * @param[in,out] A the dense matrix stored as a MultiVector + * @param[in] G preallocated general matrix + * @param[in] context LinAlg context + * @param[in] MBlockSize determines the block size for + * blocked loop over the M vector size during the in-place computation + * Default value of 0 sets block size heuristically. + * @param[in] NBlockSize determines the block size for + * blocked loop over the N vectors. Default value of 0 sets block size + * heuristically. + */ + template + void + computeAMultiVecTimesGeneralMat( + MultiVector & A, + const GeneralMatrix &G, + LinAlgOpContext & context, + const size_type MBlockSize = 0, + const size_type NBlockSize = 0); + + + template + void + choleskyOrthogonalization(MultiVector &A, + LinAlgOpContext & context); + + } // namespace MatVecOperations + } // namespace linearAlgebra + +} // namespace dftefe + +#include "MatVecOperations.t.cpp" +#endif // dftefeMatVecOperations_h diff --git a/src/linearAlgebra/MatVecOperations.t.cpp b/src/linearAlgebra/MatVecOperations.t.cpp new file mode 100644 index 000000000..a3d6a6332 --- /dev/null +++ b/src/linearAlgebra/MatVecOperations.t.cpp @@ -0,0 +1,143 @@ +/****************************************************************************** + * Copyright (c) 2021. * + * The Regents of the University of Michigan and DFT-EFE developers. * + * * + * This file is part of the DFT-EFE code. * + * * + * DFT-EFE is free software: you can redistribute it and/or modify * + * it under the terms of the Lesser GNU General Public License as * + * published by the Free Software Foundation, either version 3 of * + * the License, or (at your option) any later version. * + * * + * DFT-EFE is distributed in the hope that it will be useful, but * + * WITHOUT ANY WARRANTY; without even the implied warranty * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the Lesser GNU General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License at the top level of DFT-EFE distribution. If not, see * + * . * + ******************************************************************************/ + +/* + * @author Sambit Das + */ + +#include + +namespace dftefe +{ + namespace linearAlgebra + { + template + void + MatVecOperations::computeAMultiVecConjTransTimesAMultiVec( + const MultiVector &A, + HermitianMatrix & S, + LinAlgOpContext & context, + const size_type NBlockSize) + { + const size_type N = A.numVectors(); + + const size_type maxEntriesInBlock = + 40000 * 400; // about 250 MB in FP64 datatype + const size_type NBlockSizeUsed = + NBlockSize == 0 ? (maxEntriesInBlock / N) : NBlockSize; + + dftefe::utils::MemoryStorage overlapMatrixBlock( + N * NBlockSizeUsed, 0); + + for (size_type ivec = 0; ivec < N; ivec += NBlockSizeUsed) + { + // Correct block dimensions if block "goes off edge of" the matrix + const size_type B = std::min(NBlockSizeUsed, N - ivec); + + const blasLapack::Op + transA = blasLapack::Op::NoTrans, + transB = std::is_same>::value ? + blasLapack::Op::ConjTrans : + blasLapack::Op::Trans; + const ValueType scalarCoeffAlpha = 1.0, scalarCoeffBeta = 0.0; + + // member function to be created + overlapMatrixBlock = 0.; + + const unsigned int D = N - ivec; + + // Block1: (0,M-1,ivec-1,N-1) + // Block2: (0,M-1,ivec-1,ivec+B-1) + // Comptute local ABlock1^{Trans}*ABlock2Conj. + blasLapack::gemm( + blasLapack::Layout::ColMajor, + transA, + transB, + D, + B, + A.getMPIPatternP2P()->localOwnedSize(), + scalarCoeffAlpha, + A.data() + ivec, + N, + A.data() + ivec, + N, + scalarCoeffBeta, + overlapMatrixBlock.data(), + D, + context); + + + // Sum local ABlock1^{Trans}*ABlock2Conj across MPI tasks + utils::mpi::MPIAllreduce( + MPI_IN_PLACE, + overlapMatrixBlock.data(), + D * B * sizeof(ValueType), + utils::mpi::MPIByte, + utils::mpi::MPISum, + A.getMPIPatternP2P()->mpiCommunicator()); + + + // Copy only the lower/upper triangular part to the Hermitian + // overlap matrix, the matrix object already assumed to be created + // outside this function + // FIXME:The setValues function should work for the case where + // the Hermitian matrix is actually storing upper triangular values + // and the setValues function tries to set for the lower triangular + // part + S.setValues( + ivec - 1, N - 1, ivec - 1, ivec + B - 1, overlapMatrixBlock.data()); + } + + // FIXME: to be implemented (not complex conjugate transpose) + // S.complexConjugate(); + } + + + template + void + computeAMultiVecTimesTriangularMat( + MultiVector & A, + const TriangularMatrix &T, + LinAlgOpContext & context, + const size_type MBlockSize = 0, + const size_type NBlockSize = 0) + {} + + + template + void + computeAMultiVecTimesGeneralMat( + MultiVector & A, + const GeneralMatrix &T, + LinAlgOpContext & context, + const size_type MBlockSize = 0, + const size_type NBlockSize = 0) + {} + + + template + void + MatVecOperations::choleskyOrthogonalization( + MultiVector &A, + LinAlgOpContext & context) + {} + } // namespace linearAlgebra +} // namespace dftefe diff --git a/src/linearAlgebra/MatrixOperations.t.cpp b/src/linearAlgebra/MatrixOperations.t.cpp index 0a642ee34..deabe899f 100644 --- a/src/linearAlgebra/MatrixOperations.t.cpp +++ b/src/linearAlgebra/MatrixOperations.t.cpp @@ -37,22 +37,28 @@ namespace dftefe ValueType beta, GeneralMatrix &C) { - if (memorySpace == dftefe::utils::MemorySpace::DEVICE) - { - slate::multiply(alpha, - A.getSlateMatrix(), - B.getSlateMatrix(), - beta, - {slate::Option::Target, slate::Target::Devices}); - } - else - { - slate::multiply(alpha, - A.getSlateMatrix(), - B.getSlateMatrix(), - beta, - {slate::Option::Target, slate::Target::HostTask}); - } + // if (memorySpace == dftefe::utils::MemorySpace::DEVICE) + // { + // slate::multiply(alpha, + // A.getSlateMatrix(), + // B.getSlateMatrix(), + // beta, + // {slate::Option::Target, + // slate::Target::Devices}); + // } + // else + // { + // slate::multiply(alpha, + // A.getSlateMatrix(), + // B.getSlateMatrix(), + // beta, + // {slate::Option::Target, slate::Target::Host}); + slate::multiply(alpha, + A.getSlateMatrix(), + B.getSlateMatrix(), + beta, + C.getSlateMatrix()); + // } } @@ -78,7 +84,7 @@ namespace dftefe A.getSlateMatrix(), B.getSlateMatrix(), beta, - {slate::Option::Target, slate::Target::HostTask}); + {slate::Option::Target, slate::Target::Host}); } } @@ -106,7 +112,7 @@ namespace dftefe A.getSlateMatrix(), B.getSlateMatrix(), beta, - {slate::Option::Target, slate::Target::HostTask}); + {slate::Option::Target, slate::Target::Host}); } } @@ -127,7 +133,7 @@ namespace dftefe { slate::multiply(A.getSlateMatrix(), B.getSlateMatrix(), - {slate::Option::Target, slate::Target::HostTask}); + {slate::Option::Target, slate::Target::Host}); } } @@ -149,7 +155,7 @@ namespace dftefe { slate::multiply(A.getSlateMatrix(), B.getSlateMatrix(), - {slate::Option::Target, slate::Target::HostTask}); + {slate::Option::Target, slate::Target::Host}); } } @@ -171,8 +177,7 @@ namespace dftefe { slate::triangular_solve(A.getSlateMatrix(), B.getSlateMatrix(), - {slate::Option::Target, - slate::Target::HostTask}); + {slate::Option::Target, slate::Target::Host}); } } @@ -195,8 +200,7 @@ namespace dftefe { slate::triangular_solve(A.getSlateMatrix(), B.getSlateMatrix(), - {slate::Option::Target, - slate::Target::HostTask}); + {slate::Option::Target, slate::Target::Host}); } } diff --git a/src/linearAlgebra/DistributedDenseMatrix.h b/src/linearAlgebra/MatrixProperties.h similarity index 83% rename from src/linearAlgebra/DistributedDenseMatrix.h rename to src/linearAlgebra/MatrixProperties.h index c447479e5..7f1682a60 100644 --- a/src/linearAlgebra/DistributedDenseMatrix.h +++ b/src/linearAlgebra/MatrixProperties.h @@ -23,21 +23,15 @@ * @author Ian C. Lin. */ -#ifndef dftefeDistributedDenseMatrix_h -#define dftefeDistributedDenseMatrix_h - -#include "Matrix.h" +#ifndef dftefeMatrixProperties_h +#define dftefeMatrixProperties_h namespace dftefe { namespace linearAlgebra { - template - class DistributedDenseMatrix : protected Matrix - {}; - } // namespace linearAlgebra + enum class Uplo + } } // namespace dftefe - - -#endif // dftefeDistributedDenseMatrix_h +#endif // dftefeMatrixProperties_h diff --git a/src/main.cpp b/src/main.cpp index a62c06021..7981d13bc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,7 +10,8 @@ #include #include - +#include +#include #ifdef DFTEFE_WITH_MPI # include #endif @@ -19,6 +20,9 @@ # include #endif +#include +#include + int main(int argc, char **argv) { @@ -53,4 +57,51 @@ main(int argc, char **argv) } printf("This is gpu code"); #endif + + std::vector dA(25, 0.0), dB(15, 0.0); + for (int i = 0; i < 25; ++i) + dA[i] = i + 1; + for (int i = 0; i < 15; ++i) + dB[i] = i + 25; + + // dftefe::linearAlgebra::HermitianMatrix + // A(dftefe::linearAlgebra::Uplo::Lower, 5, MPI_COMM_SELF, 1, 1); + + dftefe::linearAlgebra::GeneralMatrix + A(5, 5, MPI_COMM_SELF, 1, 1, 5, 5); + // B(5, 3, MPI_COMM_SELF, 1, 1, 3, 5), + // C(5, 3, MPI_COMM_SELF, 1, 1, 3, 5); + // A.setValues(dA.data()); + // A.setValues(1, 4, 2, 3, dA.data()); + // B.setValues(dB.data()); + slate::print("matrix A", A.getSlateMatrix()); + // slate::print("matrix B", B.getSlateMatrix()); + // dftefe::linearAlgebra::MatrixOperations::multiply(1.0, A, B, 0.0, C); + + // blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans, + // 5, 3, 5, 1.0, A.getSlateMatrix()(0,0).data(), 5, + // B.getSlateMatrix()(0,0).data(), 5, 0.0, C.getSlateMatrix()(0,0).data(), + // 5); slate::print("matrix C", C.getSlateMatrix()); + + // std::vector dD = {1.148258296912840, 1.001372859769502, + // 0.994403249881076, 1.638906883577435, 1.316661477930142, 1.001372859769502, + // 2.019228542698278, 1.612894116737191, 1.932571381399861, 2.047167619624847, + // 0.994403249881076, 1.612894116737191, 2.790965744547513, 2.622748446448737, + // 2.828958221480860, 1.638906883577435, 1.932571381399861, 2.622748446448737, + // 3.486600154718786, 3.283956672039348, 1.316661477930142, 2.047167619624847, + // 2.828958221480860, 3.283956672039348, 3.352644057275798}; + // dftefe::linearAlgebra::GeneralMatrix + // D(5, 5, MPI_COMM_SELF, 1, 1, 5, 5), Z(5, 5, MPI_COMM_SELF, 1, 1, 5, + // 5); + // D.setValues(dD.data()); + // slate::print("matrix", D.getSlateMatrix()); + // + // slate::SymmetricMatrix Dh(slate::Uplo::Lower, + // D.getSlateMatrix()); slate::print("matrix", Dh); + // lapack::syev(lapack::Job::Vec, lapack::Uplo::Lower, 5, Dh(0,0).data(), + // 5, Z.getSlateMatrix()(0,0).data()); slate::print("matrix", + // Z.getSlateMatrix()); slate::print("matrix", Dh); }