diff --git a/src/linearAlgebra/AbstractMatrix.h b/src/linearAlgebra/AbstractMatrix.h
index 2d8ccbecc..71e7d4739 100644
--- a/src/linearAlgebra/AbstractMatrix.h
+++ b/src/linearAlgebra/AbstractMatrix.h
@@ -40,6 +40,93 @@ namespace dftefe
     template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
     class AbstractMatrix
     {
+    public:
+      /**
+       * @brief The value setter interface for the matrix class. This setter
+       *        assumes the given pointer to ValueType to be a serially repeated
+       *        matrix and copy the data to the corresponding local owned
+       *        location trivially.
+       * @warning There is no boundary check in this function. The user should
+       *          be responsible that the size of data array should have d_m*d_n
+       *          size.
+       * @param data The pointer to the data to be copied from.
+       */
+      virtual void
+      setValues(const ValueType *data) = 0;
+
+      /**
+       * @brief The value setter interface for the matrix class. This setter
+       *        assumes the given pointer to ValueType to be a serially repeated
+       *        matrix and copy the data to the corresponding local owned
+       *        sub-matrix (i1:i2-1, j1:j2-1) trivially.
+       * @warning There is no boundary check in this function. The user should
+       *          be responsible that the size of data array should have
+       *          (j2-j1)*(i2-i1) size.
+       * @param i1 the global index of the first row of the submatrix.
+       * @param i2 one passes the global index of the last row of the submatrix.
+       * @param j1 the global index of the first column of the submatrix.
+       * @param j2 one passes the global index of the last column of the
+       *           submatrix.
+       * @param data The pointer to the data to be copied from. The user should
+       *             be responsible that the size of data array should have
+       *             (j2-j1)*(i2-i1) size.
+       */
+      virtual void
+      setValues(size_t           i1,
+                size_t           i2,
+                size_t           j1,
+                size_t           j2,
+                const ValueType *data) = 0;
+
+      //      /**
+      //       * @brief The value setter interface for the matrix class. This setter
+      //       *        allows data to be different on each processor. The
+      //       locally owned
+      //       *        data contains two parts: (1) values belongs to the
+      //       locally owned
+      //       *        portion of the matrix and (2) values belong to the
+      //       off-processor
+      //       *        portion of the matrix. This setter will distribute the
+      //       *        off-processor values correspondingly.
+      //       * @warning This routine currently is not optimized and fully
+      //       *          tested. Using this routing to assign values can
+      //       *          deteriorate the performance dramatically. This should
+      //       *          only be used for debugging purpose or not performance
+      //       *          relevant part of the code.
+      //       * @param i1 the global index of the first row of the submatrix.
+      //       * @param i2 one passes the global index of the last row of the submatrix.
+      //       * @param j1 the global index of the first column of the submatrix.
+      //       * @param j2 one passes the global index of the last column of the
+      //       *           submatrix.
+      //       * @param data The pointer to the data to be copied from. The user should
+      //       *             be responsible that the size of data array should
+      //       have
+      //       *             (j2-j1)*(i2-i1) size.
+      //       */
+      //      virtual void
+      //      setDistributedValues(size_t           i1,
+      //                           size_t           i2,
+      //                           size_t           j1,
+      //                           size_t           j2,
+      //                           const ValueType *data) = 0;
+      //
+      //      /**
+      //       * @brief The value setter interface for inserting a single value.
+      //       * @warning This routine is sub-optimal and can seriously deteriorate the
+      //       *          the performance. It should be only used when
+      //       necessary. Only
+      //       *          the processor which owns (i, j) element will be
+      //       inserting
+      //       *          value.
+      //       * @param i the row index of the value.
+      //       * @param j the column index of the value.
+      //       * @param d the value.
+      //       */
+      //      virtual void
+      //      setValue(size_t i, size_t j, ValueType d) = 0;
+
+      AbstractMatrix() = delete;
+
     protected:
       AbstractMatrix(size_t   m,
                      size_t   n,
@@ -51,7 +138,24 @@ namespace dftefe
       size_t                        d_m, d_n;
       size_t                        d_mb, d_nb, d_p, d_q;
       MPI_Comm                      d_comm;
+      int                           d_num_ranks;
       slate::BaseMatrix<ValueType> *d_baseMatrix = nullptr;
+
+      void
+      setValueSlateMatrix(slate::BaseMatrix<ValueType> *matrix,
+                          const ValueType *             data);
+
+      void
+      setValueSlateMatrix(slate::BaseMatrix<ValueType> *matrix,
+                          const ValueType *             data,
+                          int                           i1,
+                          int                           i2,
+                          int                           j1,
+                          int                           j2);
+
+
+      void
+      initSlateMatrix(slate::BaseMatrix<ValueType> *matrix, ValueType val = 0);
     };
 
   } // namespace linearAlgebra
diff --git a/src/linearAlgebra/AbstractMatrix.t.cpp b/src/linearAlgebra/AbstractMatrix.t.cpp
index 25a39b375..30223453c 100644
--- a/src/linearAlgebra/AbstractMatrix.t.cpp
+++ b/src/linearAlgebra/AbstractMatrix.t.cpp
@@ -23,6 +23,8 @@
  * @author Ian C. Lin.
  */
 
+#include "AbstractMatrix.h"
+
 namespace dftefe
 {
   namespace linearAlgebra
@@ -44,6 +46,95 @@ namespace dftefe
       , d_mb(mb)
     {}
 
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    AbstractMatrix<ValueType, memorySpace>::setValues(const ValueType *data)
+    {
+      setValueSlateMatrix(d_baseMatrix, data);
+    }
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    AbstractMatrix<ValueType, memorySpace>::initSlateMatrix(
+      slate::BaseMatrix<ValueType> *matrix,
+      ValueType                     val)
+    {
+      for (int64_t j = 0, j_offset = 0; j < matrix->nt();
+           j_offset += matrix->tileNb(j++))
+        {
+          for (int64_t i = 0, i_offset = 0; i < matrix->mt();
+               i_offset += matrix->tileMb(i++))
+            {
+              if (matrix->tileIsLocal(i, j))
+                {
+                  matrix->at(i, j).set(val);
+                }
+            }
+        }
+    }
 
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    AbstractMatrix<ValueType, memorySpace>::setValueSlateMatrix(
+      slate::BaseMatrix<ValueType> *matrix,
+      const ValueType *             data)
+    {
+      for (int64_t j = 0, j_offset = 0; j < matrix->nt();
+           j_offset += matrix->tileNb(j++))
+        {
+          for (int64_t i = 0, i_offset = 0; i < matrix->mt();
+               i_offset += matrix->tileMb(i++))
+            {
+              if (matrix->tileIsLocal(i, j))
+                {
+                  slate::Tile<double> T = (*matrix)(i, j);
+                  // todo: check for transpose case (d_m and d_n)
+                  int64_t mb = T.mb(), nb = T.nb(),
+                          offset = i_offset + j_offset * d_m;
+                  lapack::lacpy(lapack::MatrixType::General,
+                                mb,
+                                nb,
+                                &data[offset],
+                                d_m,
+                                T.data(),
+                                mb);
+                }
+            }
+        }
+    }
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    AbstractMatrix<ValueType, memorySpace>::setValueSlateMatrix(
+      slate::BaseMatrix<ValueType> *matrix,
+      const ValueType *             data,
+      int                           i1,
+      int                           i2,
+      int                           j1,
+      int                           j2)
+    {
+      for (int64_t j = 0, j_offset = 0; j < matrix->nt();
+           j_offset += matrix->tileNb(j++))
+        {
+          for (int64_t i = 0, i_offset = 0; i < matrix->mt();
+               i_offset += matrix->tileMb(i++))
+            {
+              if (matrix->tileIsLocal(i, j))
+                {
+                  slate::Tile<double> T = (*matrix)(i, j);
+                  // todo: check for transpose case (d_m and d_n)
+                  int64_t mb = T.mb(), nb = T.nb(),
+                          offset = i_offset + j_offset * d_m;
+                  int64_t ii1    = lapack::lacpy(lapack::MatrixType::General,
+                                              mb,
+                                              nb,
+                                              &data[offset],
+                                              d_m,
+                                              T.data(),
+                                              mb);
+                }
+            }
+        }
+    }
   } // namespace linearAlgebra
 } // namespace dftefe
diff --git a/src/linearAlgebra/CMakeLists.txt b/src/linearAlgebra/CMakeLists.txt
index 0b19d7f6f..5eb545681 100644
--- a/src/linearAlgebra/CMakeLists.txt
+++ b/src/linearAlgebra/CMakeLists.txt
@@ -13,20 +13,29 @@ if (NOT TARGET dft-efe-linalg)
             QueueManager.cpp
             LinAlgOpContext.cpp
             VectorAttributes.cpp
-	    Vector.cpp
-	    MultiVector.cpp
+	          Vector.cpp
+	          MultiVector.cpp
             AbstractMatrix.cpp
             GeneralMatrix.cpp
             HermitianMatrix.cpp
             TriangularMatrix.cpp
             MatrixKernels.cu
             MatrixKernels.cpp
-            MatrixOperations.cpp)
+            MatrixOperations.cpp
+            MatVecOperations.cpp)
 
     add_library(dft-efe-linalg SHARED ${DFT-EFE-LINALG-SOURCES})
 
     include_directories(../)
 
+    find_package(deal.II 9.3.0 REQUIRED HINTS ${DEALII_PATH})
+    target_include_directories(dft-efe-linalg PUBLIC ${DEAL_II_INCLUDE_DIRS})
+    IF("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
+        target_link_libraries (dft-efe-linalg PUBLIC ${DEAL_II_LIBRARIES_RELEASE})
+    ELSE()
+        target_link_libraries (dft-efe-linalg PUBLIC ${DEAL_II_LIBRARIES_DEBUG})
+    ENDIF()
+
     if (ENABLE_MPI)
         add_compile_definitions(DFTEFE_WITH_MPI)
         if (NOT MPI_FOUND)
@@ -55,5 +64,5 @@ if (NOT TARGET dft-efe-linalg)
     target_include_directories(dft-efe-linalg PUBLIC ${SLATE_DIR}/include)
     target_link_directories(dft-efe-linalg PUBLIC ${SLATE_DIR}/lib64)
 
-    target_link_libraries(dft-efe-linalg PUBLIC blaspp slate dft-efe-utils ${DFTEFE_LINEARALGEBRA_MPI_LIBRARIES})
+    target_link_libraries(dft-efe-linalg PUBLIC blaspp lapackpp slate gomp dft-efe-utils ${DFTEFE_LINEARALGEBRA_MPI_LIBRARIES})
 endif ()
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/README b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/README
new file mode 100644
index 000000000..ac6d23e1c
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/README
@@ -0,0 +1,4 @@
+This directory is adapted from Scalapack wrapper in DFT-FE and Deal.ii code. It contains the scalapack wrapper when we
+really need a fall back option if SLATE does not work.
+
+This project currently does NOT support Scalapck.
\ No newline at end of file
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/lapack_support.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/lapack_support.h
new file mode 100644
index 000000000..614671841
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/lapack_support.h
@@ -0,0 +1,233 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2005 - 2018 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+// ---------------------------------------------------------------------
+
+#ifndef dftfe_lapack_support_h
+#define dftfe_lapack_support_h
+
+
+#include <deal.II/base/exceptions.h>
+namespace dftefeScalapack
+{
+  namespace types
+  {
+#ifdef LAPACK_WITH_64BIT_BLAS_INDICES
+    /**
+     * Integer type in BLAS.
+     */
+    using blas_int = long long;
+#else
+    /**
+     * Integer type in BLAS.
+     */
+    using blas_int = int;
+#endif
+  } // namespace types
+
+  /**
+   * A namespace containing constants, exceptions, enumerations, and other
+   * utilities used by the deal.II LAPACK bindings.
+   */
+  namespace LAPACKSupport
+  {
+    /**
+     * Most of the LAPACK functions one can apply to a matrix (e.g., by calling
+     * the member functions of this class) change its content in some ways. For
+     * example, they may invert the matrix, or may replace it by a matrix whose
+     * columns represent the eigenvectors of the original content of the matrix.
+     * The elements of this enumeration are therefore used to track what is
+     * currently being stored by this object.
+     */
+    enum State
+    {
+      /// Contents is actually a matrix.
+      matrix,
+      /// Contents is the inverse of a matrix.
+      inverse_matrix,
+      /// Contents is an LU decomposition.
+      lu,
+      /// Contents is a Cholesky decomposition.
+      cholesky,
+      /// Eigenvalue vector is filled
+      eigenvalues,
+      /// Matrix contains singular value decomposition,
+      svd,
+      /// Matrix is the inverse of a singular value decomposition
+      inverse_svd,
+      /// Contents is something useless.
+      unusable = 0x8000
+    };
+
+    /**
+     * %Function printing the name of a State.
+     */
+    inline const char *
+    state_name(State s)
+    {
+      switch (s)
+        {
+          case matrix:
+            return "matrix";
+          case inverse_matrix:
+            return "inverse matrix";
+          case lu:
+            return "lu decomposition";
+          case cholesky:
+            return "cholesky decomposition";
+          case eigenvalues:
+            return "eigenvalues";
+          case svd:
+            return "svd";
+          case inverse_svd:
+            return "inverse_svd";
+          case unusable:
+            return "unusable";
+          default:
+            return "unknown";
+        }
+    }
+
+    /**
+     * A matrix can have certain features allowing for optimization, but hard to
+     * test. These are listed here.
+     */
+    enum Property
+    {
+      /// No special properties
+      general = 0,
+      /// Matrix is symmetric
+      hermitian = 1,
+      /// Matrix is upper triangular
+      upper_triangular = 2,
+      /// Matrix is lower triangular
+      lower_triangular = 4,
+      /// Matrix is diagonal
+      diagonal = 6,
+      /// Matrix is in upper Hessenberg form
+      hessenberg = 8
+    };
+
+    /**
+     * %Function printing the name of a Property.
+     */
+    inline const char *
+    property_name(const Property s)
+    {
+      switch (s)
+        {
+          case general:
+            return "general";
+          case hermitian:
+            return "hermitian";
+          case upper_triangular:
+            return "upper triangular";
+          case lower_triangular:
+            return "lower triangular";
+          case diagonal:
+            return "diagonal";
+          case hessenberg:
+            return "Hessenberg";
+        }
+
+      Assert(false, dealii::ExcNotImplemented());
+      return "invalid";
+    }
+
+    /**
+     * Character constant.
+     */
+    static const char A = 'A';
+    /**
+     * Character constant.
+     */
+    static const char N = 'N';
+    /**
+     * Character constant.
+     */
+    static const char O = 'O';
+    /**
+     * Character constant.
+     */
+    static const char T = 'T';
+    /**
+     * Character constant for conjugate transpose.
+     */
+    static const char C = 'C';
+    /**
+     * Character constant.
+     */
+    static const char U = 'U';
+    /**
+     * Character constant.
+     */
+    static const char L = 'L';
+    /**
+     * Character constant.
+     */
+    static const char V = 'V';
+    /**
+     * Integer constant.
+     */
+    static const types::blas_int zero = 0;
+    /**
+     * Integer constant.
+     */
+    static const types::blas_int one = 1;
+
+    /**
+     * A LAPACK function returned an error code.
+     */
+    DeclException2(ExcErrorCode,
+                   std::string,
+                   types::blas_int,
+                   << "The function " << arg1 << " returned with an error code "
+                   << arg2);
+
+    /**
+     * Exception thrown when a matrix is not in a suitable state for an
+     * operation. For instance, a LAPACK routine may have left the matrix in an
+     * unusable state, then vmult does not make sense anymore.
+     */
+    DeclException1(
+      ExcState,
+      State,
+      << "The function cannot be called while the matrix is in state "
+      << state_name(arg1));
+
+    /**
+     * Exception thrown when a matrix does not have suitable properties for an
+     * operation.
+     */
+    DeclException1(ExcProperty,
+                   Property,
+                   << "The function cannot be called with a "
+                   << property_name(arg1) << " matrix.");
+
+    /**
+     * This exception is thrown if a certain LAPACK function is not available
+     * because no LAPACK installation was detected during configuration.
+     */
+    DeclException1(
+      ExcMissing,
+      std::string,
+      << "When you ran 'cmake' during installation of deal.II, "
+      << "no suitable installation of the BLAS or LAPACK library could "
+      << "be found. Consequently, the function <" << arg1
+      << "> can not be called. Refer to the doc/readme.html "
+      << "file for information on how to ensure that deal.II "
+      << "picks up an existing BLAS and LAPACK installation at "
+      << "configuration time.");
+  } // namespace LAPACKSupport
+} // namespace dftefeScalapack
+#endif
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.cpp b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.cpp
new file mode 100644
index 000000000..0430a6045
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.cpp
@@ -0,0 +1,273 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2017 - 2019 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+// ---------------------------------------------------------------------
+
+#include "process_grid.h"
+#include "scalapack.templates.h"
+#include <deal.II/base/mpi.templates.h>
+
+namespace dftefeScalapack
+{
+  namespace
+  {
+    /**
+     * Internal function to determine dimension of process grid based on the
+     * total number of cores, matrix dimensions and matrix block sizes
+     *
+     * Amesos heuristics:
+     * https://github.com/trilinos/Trilinos/blob/master/packages/amesos/src/Amesos_Scalapack.cpp#L142-L166
+     *
+     * Elemental default grid: El::Grid::Grid(mpi_comm,...)
+     * https://github.com/elemental/Elemental/blob/master/src/core/Grid.cpp#L67-L91
+     */
+    inline std::pair<int, int>
+    compute_processor_grid_sizes(const MPI_Comm &   mpi_comm,
+                                 const unsigned int m,
+                                 const unsigned int n,
+                                 const unsigned int block_size_m,
+                                 const unsigned int block_size_n)
+    {
+      // Few notes from the ScaLAPACK user guide:
+      // It is possible to predict the best grid shape given the number of
+      // processes available: Pr x Pc <= P This, however, depends on the task to
+      // be done. LU , QR and QL factorizations perform better for “flat”
+      // process grids (Pr < Pc ) For large N, Pc = 2*Pr is a good choice,
+      // whereas for small N, one should choose small Pr Square or near square
+      // grids are more optimal for Cholesky factorization. LQ and RQ
+      // factorizations take advantage of “tall” grids (Pr > Pc )
+
+      // Below we always try to create 2D processor grids:
+
+      const int n_processes = dealii::Utilities::MPI::n_mpi_processes(mpi_comm);
+
+      // Get the total number of cores we can occupy in a rectangular dense
+      // matrix with rectangular blocks when every core owns only a single
+      // block:
+      const int n_processes_heuristic =
+        int(std::ceil((1. * m) / block_size_m)) *
+        int(std::ceil((1. * n) / block_size_n));
+      const int Np = std::min(n_processes_heuristic, n_processes);
+
+      // Now we need to split Np into  Pr x Pc. Assume we know the shape/ratio
+      // Pc =: ratio * Pr
+      // therefore
+      // Np = Pc * Pc / ratio
+      // for quadratic matrices the ratio equals 1
+      const double ratio = double(n) / m;
+      int          Pc    = static_cast<int>(std::sqrt(ratio * Np));
+
+      // one could rounds up Pc to the number which has zero remainder from the
+      // division of Np while ( Np % Pc != 0 )
+      //  ++Pc;
+      // but this affects the grid shape dramatically, i.e. 10 cores 3x3 becomes
+      // 2x5.
+
+      // limit our estimate to be in [2, Np]
+      int n_process_columns = std::min(Np, std::max(2, Pc));
+      // finally, get the rows:
+      int n_process_rows = Np / n_process_columns;
+
+      Assert(n_process_columns >= 1 && n_process_rows >= 1 &&
+               n_processes >= n_process_rows * n_process_columns,
+             dealii::ExcMessage(
+               "error in process grid: " + std::to_string(n_process_rows) +
+               "x" + std::to_string(n_process_columns) + "=" +
+               std::to_string(n_process_rows * n_process_columns) + " out of " +
+               std::to_string(n_processes)));
+
+      return std::make_pair(n_process_rows, n_process_columns);
+
+      // For example,
+      // 320x320 with 32x32 blocks and 16 cores:
+      // Pc = 1.0 * Pr  => 4x4 grid
+      // Pc = 0.5 * Pr  => 8x2 grid
+      // Pc = 2.0 * Pr  => 3x5 grid
+    }
+  } // namespace
+
+
+  ProcessGrid::ProcessGrid(
+    const MPI_Comm &                             mpi_comm,
+    const std::pair<unsigned int, unsigned int> &grid_dimensions)
+    : mpi_communicator(mpi_comm)
+    , this_mpi_process(
+        dealii::Utilities::MPI::this_mpi_process(mpi_communicator))
+    , n_mpi_processes(dealii::Utilities::MPI::n_mpi_processes(mpi_communicator))
+    , n_process_rows(grid_dimensions.first)
+    , n_process_columns(grid_dimensions.second)
+  {
+    Assert(grid_dimensions.first > 0,
+           dealii::ExcMessage(
+             "Number of process grid rows has to be positive."));
+    Assert(grid_dimensions.second > 0,
+           dealii::ExcMessage(
+             "Number of process grid columns has to be positive."));
+
+    Assert(
+      grid_dimensions.first * grid_dimensions.second <= n_mpi_processes,
+      dealii::ExcMessage(
+        "Size of process grid is larger than number of available MPI processes."));
+
+    // processor grid order.
+    const bool column_major = false;
+
+    // Initialize Cblas context from the provided communicator
+    blacs_context     = Csys2blacs_handle(mpi_communicator);
+    const char *order = (column_major ? "Col" : "Row");
+    // Note that blacs_context can be modified below. Thus Cblacs2sys_handle
+    // may not return the same MPI communicator.
+    Cblacs_gridinit(&blacs_context, order, n_process_rows, n_process_columns);
+
+    // Blacs may modify the grid size on processes which are not used
+    // in the grid. So provide copies below:
+    int procrows_ = n_process_rows;
+    int proccols_ = n_process_columns;
+    Cblacs_gridinfo(blacs_context,
+                    &procrows_,
+                    &proccols_,
+                    &this_process_row,
+                    &this_process_column);
+
+    // If this MPI core is not on the grid, flag it as inactive and
+    // skip all jobs
+    // Note that a different condition is used in FORTRAN code here
+    // https://stackoverflow.com/questions/18516915/calling-blacs-with-more-processes-than-used
+    if (this_process_row < 0 || this_process_column < 0)
+      mpi_process_is_active = false;
+    else
+      mpi_process_is_active = true;
+
+    // Create an auxiliary communicator which has root and all inactive cores.
+    // Assume that inactive cores start with
+    // id=n_process_rows*n_process_columns
+    const unsigned int n_active_mpi_processes =
+      n_process_rows * n_process_columns;
+    Assert(mpi_process_is_active || this_mpi_process >= n_active_mpi_processes,
+           dealii::ExcInternalError());
+
+    std::vector<int> inactive_with_root_ranks;
+    inactive_with_root_ranks.push_back(0);
+    for (unsigned int i = n_active_mpi_processes; i < n_mpi_processes; ++i)
+      inactive_with_root_ranks.push_back(i);
+
+    // Get the group of processes in mpi_communicator
+    int       ierr = 0;
+    MPI_Group all_group;
+    ierr = MPI_Comm_group(mpi_communicator, &all_group);
+    AssertThrowMPI(ierr);
+
+    // Construct the group containing all ranks we need:
+    MPI_Group inactive_with_root_group;
+    const int n = inactive_with_root_ranks.size();
+    ierr        = MPI_Group_incl(all_group,
+                          n,
+                          inactive_with_root_ranks.data(),
+                          &inactive_with_root_group);
+    AssertThrowMPI(ierr);
+
+    // Create the communicator based on inactive_with_root_group.
+    // Note that on all the active MPI processes (except for the one with
+    // rank 0) the resulting MPI_Comm mpi_communicator_inactive_with_root
+    // will be MPI_COMM_NULL.
+    const int mpi_tag =
+      dealii::Utilities::MPI::internal::Tags::process_grid_constructor;
+
+    ierr = dealii::Utilities::MPI::create_group(
+      mpi_communicator,
+      inactive_with_root_group,
+      mpi_tag,
+      &mpi_communicator_inactive_with_root);
+    AssertThrowMPI(ierr);
+
+    ierr = MPI_Group_free(&all_group);
+    AssertThrowMPI(ierr);
+    ierr = MPI_Group_free(&inactive_with_root_group);
+    AssertThrowMPI(ierr);
+
+    // Double check that the process with rank 0 in subgroup is active:
+#ifdef DEBUG
+    if (mpi_communicator_inactive_with_root != MPI_COMM_NULL &&
+        dealii::Utilities::MPI::this_mpi_process(
+          mpi_communicator_inactive_with_root) == 0)
+      Assert(mpi_process_is_active, dealii::ExcInternalError());
+#endif
+  }
+
+
+
+  ProcessGrid::ProcessGrid(const MPI_Comm &   mpi_comm,
+                           const unsigned int n_rows_matrix,
+                           const unsigned int n_columns_matrix,
+                           const unsigned int row_block_size,
+                           const unsigned int column_block_size)
+    : ProcessGrid(mpi_comm,
+                  compute_processor_grid_sizes(mpi_comm,
+                                               n_rows_matrix,
+                                               n_columns_matrix,
+                                               row_block_size,
+                                               column_block_size))
+  {}
+
+
+
+  ProcessGrid::ProcessGrid(const MPI_Comm &   mpi_comm,
+                           const unsigned int n_rows,
+                           const unsigned int n_columns)
+    : ProcessGrid(mpi_comm, std::make_pair(n_rows, n_columns))
+  {}
+
+
+
+  ProcessGrid::~ProcessGrid()
+  {
+    if (mpi_process_is_active)
+      Cblacs_gridexit(blacs_context);
+
+    if (mpi_communicator_inactive_with_root != MPI_COMM_NULL)
+      MPI_Comm_free(&mpi_communicator_inactive_with_root);
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ProcessGrid::send_to_inactive(NumberType *value, const int count) const
+  {
+    Assert(count > 0, dealii::ExcInternalError());
+    if (mpi_communicator_inactive_with_root != MPI_COMM_NULL)
+      {
+        const int ierr =
+          MPI_Bcast(value,
+                    count,
+                    dealii::Utilities::MPI::internal::mpi_type_id(value),
+                    0 /*from root*/,
+                    mpi_communicator_inactive_with_root);
+        AssertThrowMPI(ierr);
+      }
+  }
+} // namespace dftefeScalapack
+
+// instantiations
+
+template void
+dftefeScalapack::ProcessGrid::send_to_inactive<double>(double *,
+                                                       const int) const;
+template void
+dftefeScalapack::ProcessGrid::send_to_inactive<float>(float *, const int) const;
+template void
+dftefeScalapack::ProcessGrid::send_to_inactive<int>(int *, const int) const;
+template void
+dftefeScalapack::ProcessGrid::send_to_inactive<std::complex<double>>(
+  std::complex<double> *,
+  const int) const;
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.h
new file mode 100644
index 000000000..ca68bdaca
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/process_grid.h
@@ -0,0 +1,253 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2017 - 2019 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+// ---------------------------------------------------------------------
+
+#ifndef dftfe_process_grid_h
+#define dftfe_process_grid_h
+
+#include <mpi.h>
+#include <utility>
+
+
+namespace dftefeScalapack
+{
+// Forward declaration of class ScaLAPACKMatrix for ProcessGrid
+#ifndef DOXYGEN
+  template <typename NumberType>
+  class ScaLAPACKMatrix;
+#endif
+
+  /**
+   * A class taking care of setting up a two-dimensional processor grid.
+   * For example an MPI communicator with 5 processes can be arranged into a
+   * 2x2 grid with the 5-th processor being inactive:
+   * @code
+   *      |   0     |   1
+   * -----| ------- |-----
+   * 0    |   P0    |  P1
+   *      |         |
+   * -----| ------- |-----
+   * 1    |   P2    |  P3
+   * @endcode
+   *
+   * A shared pointer to this class is provided to ScaLAPACKMatrix matrices to
+   * perform block-cyclic distribution.
+   *
+   * Note that this class allows to setup a process grid which has fewer
+   * MPI cores than the total number of cores in the communicator.
+   *
+   * Currently the only place where one would use a ProcessGrid object is
+   * in connection with a ScaLAPACKMatrix object.
+   */
+  class ProcessGrid
+  {
+  public:
+    // Declare class ScaLAPACK as friend to provide access to private members.
+    template <typename NumberType>
+    friend class dftefeScalapack::ScaLAPACKMatrix;
+
+    /**
+     * Constructor for a process grid with @p n_rows and @p n_columns for a given @p mpi_communicator.
+     * The product of rows and columns should be less or equal to the total
+     * number of cores
+     * in the @p mpi_communicator.
+     */
+    ProcessGrid(const MPI_Comm &   mpi_communicator,
+                const unsigned int n_rows,
+                const unsigned int n_columns);
+
+    /**
+     * Constructor for a process grid for a given @p mpi_communicator.
+     * In this case the process grid is heuristically chosen based on the
+     * dimensions and block-cyclic distribution of a target matrix provided
+     * in @p n_rows_matrix, @p n_columns_matrix, @p row_block_size and @p column_block_size.
+     *
+     * The maximum number of MPI cores one can utilize is
+     * $\min\{\frac{M}{MB}\frac{N}{NB}, Np\}$, where $M,N$ are the matrix
+     * dimension and $MB,NB$ are the block sizes and $Np$ is the number of
+     * processes in the @p mpi_communicator. This function then creates a 2D processor grid
+     * assuming the ratio between number of process row $p$ and columns $q$ to
+     * be equal the ratio between matrix dimensions $M$ and $N$.
+     *
+     * For example, a square matrix $640x640$ with the block size $32$
+     * and the @p mpi_communicator with 11 cores will result in the $3x3$
+     * process grid.
+     */
+    ProcessGrid(const MPI_Comm &   mpi_communicator,
+                const unsigned int n_rows_matrix,
+                const unsigned int n_columns_matrix,
+                const unsigned int row_block_size,
+                const unsigned int column_block_size);
+
+    /**
+     * Destructor.
+     */
+    ~ProcessGrid();
+
+    /**
+     * Return the number of rows in the processes grid.
+     */
+    unsigned int
+    get_process_grid_rows() const;
+
+    /**
+     * Return the number of columns in the processes grid.
+     */
+    unsigned int
+    get_process_grid_columns() const;
+
+    /**
+     * Return row of this process in the process grid.
+     *
+     * It's negative for inactive processes.
+     */
+    int
+    get_this_process_row() const;
+
+    /**
+     * Return column of this process in the process grid.
+     *
+     * It's negative for inactive processes.
+     */
+    int
+    get_this_process_column() const;
+
+    /**
+     * Send @p count values stored consequently starting at @p value from
+     * the process with rank zero to processes which
+     * are not in the process grid.
+     */
+    template <typename NumberType>
+    void
+    send_to_inactive(NumberType *value, const int count = 1) const;
+
+    /**
+     * Return <code>true</code> if the process is active within the grid.
+     */
+    bool
+    is_process_active() const;
+
+  private:
+    /**
+     * A private constructor which takes grid dimensions as an
+     * <code>std::pair</code>.
+     */
+    ProcessGrid(const MPI_Comm &                             mpi_communicator,
+                const std::pair<unsigned int, unsigned int> &grid_dimensions);
+
+    /**
+     * An MPI communicator with all processes (active and inactive).
+     */
+    MPI_Comm mpi_communicator;
+
+    /**
+     * An MPI communicator with inactive processes and the process with rank
+     * zero.
+     */
+    MPI_Comm mpi_communicator_inactive_with_root;
+
+    /**
+     * BLACS context. This is equivalent to MPI communicators and is
+     * used by ScaLAPACK.
+     */
+    int blacs_context;
+
+    /**
+     * Rank of this MPI process.
+     */
+    const unsigned int this_mpi_process;
+
+    /**
+     * Total number of MPI processes.
+     */
+    const unsigned int n_mpi_processes;
+
+    /**
+     * Number of rows in the processes grid.
+     */
+    int n_process_rows;
+
+    /**
+     * Number of columns in the processes grid.
+     */
+    int n_process_columns;
+
+    /**
+     * Row of this process in the grid.
+     *
+     * It's negative for in-active processes.
+     */
+    int this_process_row;
+
+    /**
+     * Column of this process in the grid.
+     *
+     * It's negative for in-active processes.
+     */
+    int this_process_column;
+
+    /**
+     * A flag which is true for processes within the 2D process grid.
+     */
+    bool mpi_process_is_active;
+  };
+
+  /*--------------------- Inline functions --------------------------------*/
+
+#ifndef DOXYGEN
+
+  inline unsigned int
+  ProcessGrid::get_process_grid_rows() const
+  {
+    return n_process_rows;
+  }
+
+
+
+  inline unsigned int
+  ProcessGrid::get_process_grid_columns() const
+  {
+    return n_process_columns;
+  }
+
+
+
+  inline int
+  ProcessGrid::get_this_process_row() const
+  {
+    return this_process_row;
+  }
+
+
+
+  inline int
+  ProcessGrid::get_this_process_column() const
+  {
+    return this_process_column;
+  }
+
+
+
+  inline bool
+  ProcessGrid::is_process_active() const
+  {
+    return mpi_process_is_active;
+  }
+
+
+#endif // ifndef DOXYGEN
+
+} // namespace dftefeScalapack
+
+#endif
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapack.templates.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapack.templates.h
new file mode 100644
index 000000000..1c0fec41f
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapack.templates.h
@@ -0,0 +1,2803 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (C) 2017 - 2019 by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// The deal.II library is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE.md at
+// the top level directory of deal.II.
+//
+// ---------------------------------------------------------------------
+
+#ifndef dftfe_scalapack_templates_h
+#define dftfe_scalapack_templates_h
+
+
+#include <mpi.h>
+#include <complex>
+#include <vector>
+#include <deal.II/base/exceptions.h>
+
+#ifdef DEAL_II_HAVE_FP_EXCEPTIONS
+#  include <cfenv>
+#endif
+
+// useful examples:
+// https://stackoverflow.com/questions/14147705/cholesky-decomposition-scalapack-error/14203864
+// http://icl.cs.utk.edu/lapack-forum/viewtopic.php?t=139   // second post by
+// Julien Langou
+// https://andyspiros.wordpress.com/2011/07/08/an-example-of-blacs-with-c/
+// http://qboxcode.org/trac/browser/qb/tags/rel1_63_4/src/Matrix.C
+// https://gitlab.phys.ethz.ch/lwossnig/lecture/blob/a534f562dfb2ad5c564abe5c2356d5d956fb7218/examples/mpi/scalapack.cpp
+// https://github.com/elemental/Elemental/blob/master/src/core/imports/scalapack.cpp
+// https://scicomp.stackexchange.com/questions/7766/performance-optimization-or-tuning-possible-for-scalapack-gemm
+//
+// info:
+// http://www.netlib.org/scalapack/slug/index.html       // User guide
+// http://www.netlib.org/scalapack/slug/node135.html // How to Measure Errors
+// NOTE: Non-templated functions are chosen over templated functions if their
+// names match and template function is not explicitly called
+namespace dftefeScalapack
+{
+  extern "C"
+  {
+    /* Basic Linear Algebra Communication Subprograms (BLACS) declarations */
+    // https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dinitb.htm#dinitb
+
+    /**
+     * Determine how many processes are available and the current process rank.
+     *
+     * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbpnf.htm
+     */
+    void
+    Cblacs_pinfo(int *rank, int *nprocs);
+
+    /**
+     * Return internal BLACS value in @p val based on the input @p what and @p icontxt.
+     * The most common use is in retrieving a default system context (@p what = 0, @p icontxt is ignored)
+     * to be used in BLACS_GRIDINIT or BLACS_GRIDMAP.
+     *
+     * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbget.htm
+     */
+    void
+    Cblacs_get(int icontxt, int what, int *val);
+
+    /**
+     * Map the processes sequentially in row-major or column-major order
+     * into the process grid. Input arguments must be the same on every process.
+     *
+     * On return, @p context is the integer handle to the BLACS context,
+     * whereas on entry it is a system context to be used in creating the
+     * BLACS context.
+     *
+     * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbint.htm
+     */
+    void
+    Cblacs_gridinit(int *       context,
+                    const char *order,
+                    int         grid_height,
+                    int         grid_width);
+
+    /**
+     * Return the process row and column index.
+     *
+     * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dbinfo.htm
+     */
+    void
+    Cblacs_gridinfo(int  context,
+                    int *grid_height,
+                    int *grid_width,
+                    int *grid_row,
+                    int *grid_col);
+
+    /**
+     * Given the system process number, return the row and column coordinates in
+     * the BLACS' process grid.
+     */
+    void
+    Cblacs_pcoord(int ictxt, int pnum, int *prow, int *pcol);
+
+    /**
+     * Release a BLACS context.
+     */
+    void
+    Cblacs_gridexit(int context);
+
+    /**
+     * This routines holds up execution of all processes within the indicated
+     * scope until they have all called the routine.
+     */
+    void
+    Cblacs_barrier(int, const char *);
+
+    /**
+     * Free all BLACS contexts and releases all allocated memory.
+     */
+    void
+    Cblacs_exit(int error_code);
+
+    /**
+     * Receives a message from a process @prsrc, @p csrc into a general rectangular matrix.
+     *
+     * https://software.intel.com/en-us/mkl-developer-reference-c-gerv2d
+     */
+    void
+    Cdgerv2d(int context, int M, int N, double *A, int lda, int rsrc, int csrc);
+    void
+    Csgerv2d(int context, int M, int N, float *A, int lda, int rsrc, int csrc);
+
+    /**
+     * Sends the general rectangular matrix A to the destination
+     * process @p rdest @p cdest in the process grid.
+     *
+     * https://software.intel.com/en-us/mkl-developer-reference-c-2018-beta-gesd2d
+     */
+    void
+    Cdgesd2d(int     context,
+             int     M,
+             int     N,
+             double *A,
+             int     lda,
+             int     rdest,
+             int     cdest);
+    void
+    Csgesd2d(int    context,
+             int    M,
+             int    N,
+             float *A,
+             int    lda,
+             int    rdest,
+             int    cdest);
+
+    /**
+     * Get BLACS context from MPI @p comm.
+     */
+    int
+    Csys2blacs_handle(MPI_Comm comm);
+
+    /**
+     * Compute how many rows and columns each process owns (NUMber of Rows Or
+     * Columns).
+     *
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_dnumy.htm
+     */
+    int
+    numroc_(const int *n,
+            const int *nb,
+            const int *iproc,
+            const int *isproc,
+            const int *nprocs);
+    /**
+     * Compute complex conjugate
+     */
+    void
+    pzlacgv_(const int *           N,
+             std::complex<double> *A,
+             const int *           IA,
+             const int *           JA,
+             const int *           DESCA,
+             const int *           INCX);
+
+    /**
+     * Compute the Cholesky factorization of an N-by-N real
+     * symmetric positive definite distributed matrix sub( A ) denoting
+     * A(IA:IA+N-1, JA:JA+N-1).
+     *
+     * http://www.netlib.org/scalapack/explore-html/d5/d9e/pdpotrf_8f_source.html
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpotrf.htm
+     */
+    void
+    pdpotrf_(const char *UPLO,
+             const int * N,
+             double *    A,
+             const int * IA,
+             const int * JA,
+             const int * DESCA,
+             int *       INFO);
+    void
+    pspotrf_(const char *UPLO,
+             const int * N,
+             float *     A,
+             const int * IA,
+             const int * JA,
+             const int * DESCA,
+             int *       INFO);
+    void
+    pzpotrf_(const char *          UPLO,
+             const int *           N,
+             std::complex<double> *A,
+             const int *           IA,
+             const int *           JA,
+             const int *           DESCA,
+             int *                 INFO);
+
+    /**
+     * Computes an LU factorization of a general distributed matrix sub( A )
+     * using partial pivoting with row interchanges.
+     *
+     * http://www.netlib.org/scalapack/explore-html/df/dfe/pdgetrf_8f_source.html
+     * https://www.ibm.com/support/knowledgecenter/en/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgetrf.htm
+     */
+    void
+    pdgetrf_(const int *m,
+             const int *n,
+             double *   A,
+             const int *IA,
+             const int *JA,
+             const int *DESCA,
+             int *      ipiv,
+             int *      INFO);
+    void
+    psgetrf_(const int *m,
+             const int *n,
+             float *    A,
+             const int *IA,
+             const int *JA,
+             const int *DESCA,
+             int *      ipiv,
+             int *      INFO);
+    void
+    pzgetrf_(const int *           m,
+             const int *           n,
+             std::complex<double> *A,
+             const int *           IA,
+             const int *           JA,
+             const int *           DESCA,
+             int *                 ipiv,
+             int *                 INFO);
+    /**
+     * Compute the inverse of a real symmetric positive definite
+     * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1) using the
+     * Cholesky factorization sub( A ) = U**T*U or L*L**T computed by
+     * PDPOTRF.
+     *
+     * http://www.netlib.org/scalapack/explore-html/d2/d44/pdpotri_8f_source.html
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpotri.htm
+     * https://software.intel.com/en-us/mkl-developer-reference-c-p-potri
+     */
+    void
+    pdpotri_(const char *UPLO,
+             const int * N,
+             double *    A,
+             const int * IA,
+             const int * JA,
+             const int * DESCA,
+             int *       INFO);
+    void
+    pspotri_(const char *UPLO,
+             const int * N,
+             float *     A,
+             const int * IA,
+             const int * JA,
+             const int * DESCA,
+             int *       INFO);
+    void
+    pzpotri_(const char *          UPLO,
+             const int *           N,
+             std::complex<double> *A,
+             const int *           IA,
+             const int *           JA,
+             const int *           DESCA,
+             int *                 INFO);
+
+    /**
+     * PDGETRI computes the inverse of a distributed matrix using the LU
+     * factorization computed by PDGETRF. This method inverts U and then
+     * computes the inverse of sub( A ) = A(IA:IA+N-1,JA:JA+N-1) denoted
+     * InvA by solving the system InvA*L = inv(U) for InvA.
+     *
+     * http://www.netlib.org/scalapack/explore-html/d3/df3/pdgetri_8f_source.html
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgetri.htm
+     */
+    void
+    pdgetri_(const int *N,
+             double *   A,
+             const int *IA,
+             const int *JA,
+             const int *DESCA,
+             const int *ipiv,
+             double *   work,
+             int *      lwork,
+             int *      iwork,
+             int *      liwork,
+             int *      info);
+    void
+    psgetri_(const int *N,
+             float *    A,
+             const int *IA,
+             const int *JA,
+             const int *DESCA,
+             const int *ipiv,
+             float *    work,
+             int *      lwork,
+             int *      iwork,
+             int *      liwork,
+             int *      info);
+    void
+    pzgetri_(const int *           N,
+             std::complex<double> *A,
+             const int *           IA,
+             const int *           JA,
+             const int *           DESCA,
+             const int *           ipiv,
+             std::complex<double> *work,
+             int *                 lwork,
+             int *                 iwork,
+             int *                 liwork,
+             int *                 info);
+
+    /**
+     * PDTRTRI computes the inverse of a upper or lower triangular
+     * distributed matrix sub( A ) = A(IA:IA+N-1,JA:JA+N-1).
+     *
+     * http://www.netlib.org/scalapack/explore-html/d9/dc0/pdtrtri_8f_source.html
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpdtri.htm
+     * https://software.intel.com/en-us/mkl-developer-reference-c-p-trtri
+     */
+    void
+    pdtrtri_(const char *UPLO,
+             const char *DIAG,
+             const int * N,
+             double *    A,
+             const int * IA,
+             const int * JA,
+             const int * DESCA,
+             int *       INFO);
+    void
+    pstrtri_(const char *UPLO,
+             const char *DIAG,
+             const int * N,
+             float *     A,
+             const int * IA,
+             const int * JA,
+             const int * DESCA,
+             int *       INFO);
+
+    void
+    pztrtri_(const char *          UPLO,
+             const char *          DIAG,
+             const int *           N,
+             std::complex<double> *A,
+             const int *           IA,
+             const int *           JA,
+             const int *           DESCA,
+             int *                 INFO);
+
+    /**
+     * Estimate the reciprocal of the condition number (in the
+     * l1-norm) of a real symmetric positive definite distributed matrix
+     * using the Cholesky factorization.
+     *
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lpocon.htm#lpocon
+     * http://www.netlib.org/scalapack/explore-html/d4/df7/pdpocon_8f.html
+     * https://software.intel.com/en-us/mkl-developer-reference-fortran-pocon
+     */
+    void
+    pdpocon_(const char *  uplo,
+             const int *   N,
+             const double *A,
+             const int *   IA,
+             const int *   JA,
+             const int *   DESCA,
+             const double *ANORM,
+             double *      RCOND,
+             double *      WORK,
+             const int *   LWORK,
+             int *         IWORK,
+             const int *   LIWORK,
+             int *         INFO);
+    void
+    pspocon_(const char * uplo,
+             const int *  N,
+             const float *A,
+             const int *  IA,
+             const int *  JA,
+             const int *  DESCA,
+             const float *ANORM,
+             float *      RCOND,
+             float *      WORK,
+             const int *  LWORK,
+             int *        IWORK,
+             const int *  LIWORK,
+             int *        INFO);
+
+    /**
+     * Norm of a real symmetric matrix
+     *
+     * http://www.netlib.org/scalapack/explore-html/dd/d12/pdlansy_8f_source.html
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_pdlansy.htm#pdlansy
+     */
+    double
+    pdlansy_(const char *  norm,
+             const char *  uplo,
+             const int *   N,
+             const double *A,
+             const int *   IA,
+             const int *   JA,
+             const int *   DESCA,
+             double *      work);
+    float
+    pslansy_(const char * norm,
+             const char * uplo,
+             const int *  N,
+             const float *A,
+             const int *  IA,
+             const int *  JA,
+             const int *  DESCA,
+             float *      work);
+
+    /**
+     * Compute the Least Common Multiple (LCM) of two positive integers @p M and @p N.
+     * In fact the routine Compute the greatest common divisor (GCD) and
+     * use the fact that M*N = GCD*LCM.
+     *
+     * http://www.netlib.org/scalapack/explore-html/d0/d9b/ilcm_8f_source.html
+     */
+    int
+    ilcm_(const int *M, const int *N);
+
+    /**
+     * Return the ceiling of the division of two integers.
+     *
+     * http://www.netlib.org/scalapack/explore-html/df/d07/iceil_8f_source.html
+     */
+    int
+    iceil_(const int *i1, const int *i2);
+
+    /**
+     * Initialize the descriptor vector with the 8 input arguments
+     */
+    void
+    descinit_(int *      desc,
+              const int *m,
+              const int *n,
+              const int *mb,
+              const int *nb,
+              const int *irsrc,
+              const int *icsrc,
+              const int *ictxt,
+              const int *lld,
+              int *      info);
+
+    /**
+     * Compute the global index of a distributed matrix entry
+     * pointed to by the local index @p indxloc of the process indicated by
+     * @p iproc.
+     *
+     * @param indxloc The local index of the distributed matrix entry.
+     * @param nb Block size, size of the blocks the distributed matrix is split
+     * into.
+     * @param iproc The coordinate of the process whose local array row or column
+     * is to be determined
+     * @param isrcproc  The coordinate of the process that possesses the first
+     * row/column of the distributed matrix
+     * @param nprocs The total number processes over which the distributed matrix
+     * is distributed
+     */
+    int
+    indxl2g_(const int *indxloc,
+             const int *nb,
+             const int *iproc,
+             const int *isrcproc,
+             const int *nprocs);
+
+    /**
+     * Compute the solution to a real system of linear equations
+     */
+    void
+    pdgesv_(const int *n,
+            const int *nrhs,
+            double *   A,
+            const int *ia,
+            const int *ja,
+            const int *desca,
+            int *      ipiv,
+            double *   B,
+            const int *ib,
+            const int *jb,
+            const int *descb,
+            int *      info);
+    void
+    psgesv_(const int *n,
+            const int *nrhs,
+            float *    A,
+            const int *ia,
+            const int *ja,
+            const int *desca,
+            int *      ipiv,
+            float *    B,
+            const int *ib,
+            const int *jb,
+            const int *descb,
+            int *      info);
+
+    /**
+     * Perform one of the matrix-matrix operations:
+     * @f{align*}{
+     * \mathrm{sub}(C) &\dealcoloneq \alpha
+     * op(\mathrm{sub}(A))op(\mathrm{sub}(B))
+     *                            + \beta \mathrm{sub}(C), \\
+     * \mathrm{sub}(C) &\dealcoloneq \alpha
+     * op(\mathrm{sub}(A))op(\mathrm{sub}(B))
+     *                            + beta sub(C),
+     * @f}
+     * where
+     * $\mathrm{sub}(C)$ denotes C(IC:IC+M-1,JC:JC+N-1),  and, $op(X)$ is one of
+     * $op(X) = X$ or $op(X) = X^T$.
+     */
+    void
+    pdgemm_(const char *  transa,
+            const char *  transb,
+            const int *   m,
+            const int *   n,
+            const int *   k,
+            const double *alpha,
+            const double *A,
+            const int *   IA,
+            const int *   JA,
+            const int *   DESCA,
+            const double *B,
+            const int *   IB,
+            const int *   JB,
+            const int *   DESCB,
+            const double *beta,
+            double *      C,
+            const int *   IC,
+            const int *   JC,
+            const int *   DESCC);
+    void
+    psgemm_(const char * transa,
+            const char * transb,
+            const int *  m,
+            const int *  n,
+            const int *  k,
+            const float *alpha,
+            const float *A,
+            const int *  IA,
+            const int *  JA,
+            const int *  DESCA,
+            const float *B,
+            const int *  IB,
+            const int *  JB,
+            const int *  DESCB,
+            const float *beta,
+            float *      C,
+            const int *  IC,
+            const int *  JC,
+            const int *  DESCC);
+
+    void
+    pzgemm_(const char *                transa,
+            const char *                transb,
+            const int *                 m,
+            const int *                 n,
+            const int *                 k,
+            const std::complex<double> *alpha,
+            const std::complex<double> *A,
+            const int *                 IA,
+            const int *                 JA,
+            const int *                 DESCA,
+            const std::complex<double> *B,
+            const int *                 IB,
+            const int *                 JB,
+            const int *                 DESCB,
+            const std::complex<double> *beta,
+            std::complex<double> *      C,
+            const int *                 IC,
+            const int *                 JC,
+            const int *                 DESCC);
+
+    /**
+     * Return the value of the one norm, or the Frobenius norm, or the infinity
+     * norm, or the element of largest absolute value of a distributed matrix
+     */
+    double
+    pdlange_(char const *  norm,
+             const int *   m,
+             const int *   n,
+             const double *A,
+             const int *   ia,
+             const int *   ja,
+             const int *   desca,
+             double *      work);
+    float
+    pslange_(const char * norm,
+             const int *  m,
+             const int *  n,
+             const float *A,
+             const int *  ia,
+             const int *  ja,
+             const int *  desca,
+             float *      work);
+
+    /**
+     * Compute the process coordinate which possesses the entry of a
+     * distributed matrix specified by a global index
+     */
+    int
+    indxg2p_(const int *glob,
+             const int *nb,
+             const int *iproc,
+             const int *isproc,
+             const int *nprocs);
+
+    /**
+     * Compute all eigenvalues and, optionally, eigenvectors of a real symmetric
+     * matrix A by calling the recommended sequence of ScaLAPACK routines. In
+     * its present form, the routine assumes a homogeneous system and makes no
+     * checks for consistency of the eigenvalues or eigenvectors across the
+     * different processes. Because of this, it is possible that a heterogeneous
+     * system may return incorrect results without any error messages.
+     *
+     * http://www.netlib.org/scalapack/explore-html/d0/d1a/pdsyev_8f.html
+     * https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lsyev.htm#lsyev
+     */
+    void
+    pdsyev_(const char *jobz,
+            const char *uplo,
+            const int * m,
+            double *    A,
+            const int * ia,
+            const int * ja,
+            int *       desca,
+            double *    w,
+            double *    z,
+            const int * iz,
+            const int * jz,
+            int *       descz,
+            double *    work,
+            const int * lwork,
+            int *       info);
+    void
+    pssyev_(const char *jobz,
+            const char *uplo,
+            const int * m,
+            float *     A,
+            const int * ia,
+            const int * ja,
+            int *       desca,
+            float *     w,
+            float *     z,
+            const int * iz,
+            const int * jz,
+            int *       descz,
+            float *     work,
+            const int * lwork,
+            int *       info);
+    void
+    pzheev_(const char *          jobz,
+            const char *          uplo,
+            const int *           m,
+            std::complex<double> *A,
+            const int *           ia,
+            const int *           ja,
+            int *                 desca,
+            double *              w,
+            std::complex<double> *z,
+            const int *           iz,
+            const int *           jz,
+            int *                 descz,
+            std::complex<double> *work,
+            const int *           lwork,
+            int *                 info);
+
+    /**
+     * Copy all or a part of a distributed matrix A to another distributed
+     * matrix B. No communication is performed, pdlacpy performs a local copy
+     * $\mathrm{sub}(A) \dealcoloneq \mathrm{sub}(B)$, where $\mathrm{sub}(A)$
+     * denotes $A(ia:ia+m-1, ja:ja+n-1)$ and $\mathrm{sub}(B)$ denotes
+     * $B(ib:ib+m-1, jb:jb+n-1)$.
+     */
+    void
+    pdlacpy_(const char *  uplo,
+             const int *   m,
+             const int *   n,
+             const double *A,
+             const int *   ia,
+             const int *   ja,
+             const int *   desca,
+             double *      B,
+             const int *   ib,
+             const int *   jb,
+             const int *   descb);
+    void
+    pslacpy_(const char * uplo,
+             const int *  m,
+             const int *  n,
+             const float *A,
+             const int *  ia,
+             const int *  ja,
+             const int *  desca,
+             float *      B,
+             const int *  ib,
+             const int *  jb,
+             const int *  descb);
+
+    /**
+     * Copies the content of a general rectangular distributed matrix @p A to another distributed matrix @p B
+     * It is not required that the matrices A and B have the same process grid
+     * or block size, e.g. copying a matrix from a one-dimensional to a
+     * two-dimensional process grid
+     * @p ictxt is a context which is at least a union of all processes in
+     * context A and B
+     */
+    void
+    pdgemr2d_(const int *   m,
+              const int *   n,
+              const double *A,
+              const int *   ia,
+              const int *   ja,
+              const int *   desca,
+              double *      B,
+              const int *   ib,
+              const int *   jb,
+              const int *   descb,
+              const int *   ictxt);
+    void
+    psgemr2d_(const int *  m,
+              const int *  n,
+              const float *A,
+              const int *  ia,
+              const int *  ja,
+              const int *  desca,
+              float *      B,
+              const int *  ib,
+              const int *  jb,
+              const int *  descb,
+              const int *  ictxt);
+
+    /**
+     * helper routines determining machine precision
+     */
+    double
+    pdlamch_(const int *ictxt, const char *cmach);
+    float
+    pslamch_(const int *ictxt, const char *cmach);
+
+
+    /**
+     *  psyevx computes selected eigenvalues and, optionally, eigenvectors
+     *  of a real symmetric matrix A. Eigenvalues/vectors can be selected by
+     *  specifying a range of values or a range of indices for the desired
+     *  eigenvalues.
+     */
+    void
+    pdsyevx_(const char *  jobz,
+             const char *  range,
+             const char *  uplo,
+             const int *   n,
+             double *      A,
+             const int *   ia,
+             const int *   ja,
+             const int *   desca,
+             const double *VL,
+             const double *VU,
+             const int *   il,
+             const int *   iu,
+             const double *abstol,
+             const int *   m,
+             const int *   nz,
+             double *      w,
+             double *      orfac,
+             double *      Z,
+             const int *   iz,
+             const int *   jz,
+             const int *   descz,
+             double *      work,
+             int *         lwork,
+             int *         iwork,
+             int *         liwork,
+             int *         ifail,
+             int *         iclustr,
+             double *      gap,
+             int *         info);
+    void
+    pssyevx_(const char * jobz,
+             const char * range,
+             const char * uplo,
+             const int *  n,
+             float *      A,
+             const int *  ia,
+             const int *  ja,
+             const int *  desca,
+             const float *VL,
+             const float *VU,
+             const int *  il,
+             const int *  iu,
+             const float *abstol,
+             const int *  m,
+             const int *  nz,
+             float *      w,
+             float *      orfac,
+             float *      Z,
+             const int *  iz,
+             const int *  jz,
+             const int *  descz,
+             float *      work,
+             int *        lwork,
+             int *        iwork,
+             int *        liwork,
+             int *        ifail,
+             int *        iclustr,
+             float *      gap,
+             int *        info);
+    void
+    pzheevx_(const char *          jobz,
+             const char *          range,
+             const char *          uplo,
+             const int *           n,
+             std::complex<double> *A,
+             const int *           ia,
+             const int *           ja,
+             const int *           desca,
+             const double *        VL,
+             const double *        VU,
+             const int *           il,
+             const int *           iu,
+             const double *        abstol,
+             const int *           m,
+             const int *           nz,
+             double *              w,
+             double *              orfac,
+             std::complex<double> *Z,
+             const int *           iz,
+             const int *           jz,
+             const int *           descz,
+             std::complex<double> *work,
+             int *                 lwork,
+             int *                 iwork,
+             int *                 liwork,
+             int *                 ifail,
+             int *                 iclustr,
+             double *              gap,
+             int *                 info);
+
+    /*
+     * PDGESVD computes the singular value decomposition (SVD) of an
+     * M-by-N matrix A, optionally computing the left and/or right
+     * singular vectors
+     */
+    void
+    pdgesvd_(const char *jobu,
+             const char *jobvt,
+             const int * m,
+             const int * n,
+             double *    A,
+             const int * ia,
+             const int * ja,
+             const int * desca,
+             double *    S,
+             double *    U,
+             const int * iu,
+             const int * ju,
+             const int * descu,
+             double *    VT,
+             const int * ivt,
+             const int * jvt,
+             const int * descvt,
+             double *    work,
+             int *       lwork,
+             int *       info);
+    void
+    psgesvd_(const char *jobu,
+             const char *jobvt,
+             const int * m,
+             const int * n,
+             float *     A,
+             const int * ia,
+             const int * ja,
+             const int * desca,
+             float *     S,
+             float *     U,
+             const int * iu,
+             const int * ju,
+             const int * descu,
+             float *     VT,
+             const int * ivt,
+             const int * jvt,
+             const int * descvt,
+             float *     work,
+             int *       lwork,
+             int *       info);
+
+    /*
+     * P_GELS solves overdetermined or underdetermined real linear
+     * systems involving an M-by-N matrix A, or its transpose,
+     * using a QR or LQ factorization of A.  It is assumed that A has full rank.
+     */
+    void
+    pdgels_(const char *trans,
+            const int * m,
+            const int * n,
+            const int * nrhs,
+            double *    A,
+            const int * ia,
+            const int * ja,
+            const int * desca,
+            double *    B,
+            const int * ib,
+            const int * jb,
+            const int * descb,
+            double *    work,
+            int *       lwork,
+            int *       info);
+    void
+    psgels_(const char *trans,
+            const int * m,
+            const int * n,
+            const int * nrhs,
+            float *     A,
+            const int * ia,
+            const int * ja,
+            const int * desca,
+            float *     B,
+            const int * ib,
+            const int * jb,
+            const int * descb,
+            float *     work,
+            int *       lwork,
+            int *       info);
+
+    /*
+     * Perform matrix sum:
+     * @f{equation*}
+     * C \dealcoloneq \beta C + \alpha op(A),
+     * @f
+     * where $op(A)$ denotes either $op(A) = A$ or $op(A)=A^T$.
+     */
+    void
+    pdgeadd_(const char *  transa,
+             const int *   m,
+             const int *   n,
+             const double *alpha,
+             const double *A,
+             const int *   IA,
+             const int *   JA,
+             const int *   DESCA,
+             const double *beta,
+             double *      C,
+             const int *   IC,
+             const int *   JC,
+             const int *   DESCC);
+    void
+    psgeadd_(const char * transa,
+             const int *  m,
+             const int *  n,
+             const float *alpha,
+             const float *A,
+             const int *  IA,
+             const int *  JA,
+             const int *  DESCA,
+             const float *beta,
+             float *      C,
+             const int *  IC,
+             const int *  JC,
+             const int *  DESCC);
+    void
+    pzgeadd_(const char *                transa,
+             const int *                 m,
+             const int *                 n,
+             const std::complex<double> *alpha,
+             const std::complex<double> *A,
+             const int *                 IA,
+             const int *                 JA,
+             const int *                 DESCA,
+             const std::complex<double> *beta,
+             std::complex<double> *      C,
+             const int *                 IC,
+             const int *                 JC,
+             const int *                 DESCC);
+
+    /**
+     * Routine to transpose a matrix:
+     * C = beta C + alpha A^T
+     */
+    void
+    pdtran_(const int *   m,
+            const int *   n,
+            const double *alpha,
+            const double *A,
+            const int *   IA,
+            const int *   JA,
+            const int *   DESCA,
+            const double *beta,
+            double *      C,
+            const int *   IC,
+            const int *   JC,
+            const int *   DESCC);
+    void
+    pstran_(const int *  m,
+            const int *  n,
+            const float *alpha,
+            const float *A,
+            const int *  IA,
+            const int *  JA,
+            const int *  DESCA,
+            const float *beta,
+            float *      C,
+            const int *  IC,
+            const int *  JC,
+            const int *  DESCC);
+
+    /**
+     *  psyevr computes selected eigenvalues and, optionally, eigenvectors
+     *  of a real symmetric matrix A using a parallel implementation of the MRR
+     * algorithm. Eigenvalues/vectors can be selected by specifying a range of
+     * values or a range of indices for the desired eigenvalues.
+     */
+    void
+    pdsyevr_(const char *  jobz,
+             const char *  range,
+             const char *  uplo,
+             const int *   n,
+             double *      A,
+             const int *   IA,
+             const int *   JA,
+             const int *   DESCA,
+             const double *VL,
+             const double *VU,
+             const int *   IL,
+             const int *   IU,
+             int *         m,
+             int *         nz,
+             double *      w,
+             double *      Z,
+             const int *   IZ,
+             const int *   JZ,
+             const int *   DESCZ,
+             double *      work,
+             int *         lwork,
+             int *         iwork,
+             int *         liwork,
+             int *         info);
+    void
+    pssyevr_(const char * jobz,
+             const char * range,
+             const char * uplo,
+             const int *  n,
+             float *      A,
+             const int *  IA,
+             const int *  JA,
+             const int *  DESCA,
+             const float *VL,
+             const float *VU,
+             const int *  IL,
+             const int *  IU,
+             int *        m,
+             int *        nz,
+             float *      w,
+             float *      Z,
+             const int *  IZ,
+             const int *  JZ,
+             const int *  DESCZ,
+             float *      work,
+             int *        lwork,
+             int *        iwork,
+             int *        liwork,
+             int *        info);
+    void
+    pzheevr_(const char *          jobz,
+             const char *          range,
+             const char *          uplo,
+             const int *           n,
+             std::complex<double> *A,
+             const int *           IA,
+             const int *           JA,
+             const int *           DESCA,
+             const double *        VL,
+             const double *        VU,
+             const int *           IL,
+             const int *           IU,
+             int *                 m,
+             int *                 nz,
+             double *              w,
+             std::complex<double> *Z,
+             const int *           IZ,
+             const int *           JZ,
+             const int *           DESCZ,
+             std::complex<double> *work,
+             int *                 lwork,
+             int *                 iwork,
+             int *                 liwork,
+             int *                 info);
+  }
+
+
+
+  /*
+   * In the following we have template wrappers for the ScaLAPACK routines
+   * wrappers for other numeric types can be added in the future
+   */
+  template <typename number>
+  inline void
+  Cgerv2d(int /*context*/,
+          int /*M*/,
+          int /*N*/,
+          number * /*A*/,
+          int /*lda*/,
+          int /*rsrc*/,
+          int /*csrc*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  Cgerv2d(int context, int M, int N, double *A, int lda, int rsrc, int csrc)
+  {
+    Cdgerv2d(context, M, N, A, lda, rsrc, csrc);
+  }
+
+  inline void
+  Cgerv2d(int context, int M, int N, float *A, int lda, int rsrc, int csrc)
+  {
+    Csgerv2d(context, M, N, A, lda, rsrc, csrc);
+  }
+
+
+  template <typename number>
+  inline void
+  Cgesd2d(int /*context*/,
+          int /*M*/,
+          int /*N*/,
+          number * /*A*/,
+          int /*lda*/,
+          int /*rdest*/,
+          int /*cdest*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  Cgesd2d(int context, int M, int N, double *A, int lda, int rdest, int cdest)
+  {
+    Cdgesd2d(context, M, N, A, lda, rdest, cdest);
+  }
+
+  inline void
+  Cgesd2d(int context, int M, int N, float *A, int lda, int rdest, int cdest)
+  {
+    Csgesd2d(context, M, N, A, lda, rdest, cdest);
+  }
+
+  inline void
+  pplacgv(const int *N,
+          double *   A,
+          const int *IA,
+          const int *JA,
+          const int *DESCA,
+          const int *INCX)
+  {}
+
+  inline void
+  pplacgv(const int *           N,
+          std::complex<double> *A,
+          const int *           IA,
+          const int *           JA,
+          const int *           DESCA,
+          const int *           INCX)
+  {
+    pzlacgv_(N, A, IA, JA, DESCA, INCX);
+  }
+
+  template <typename number>
+  inline void
+  ppotrf(const char * /*UPLO*/,
+         const int * /*N*/,
+         number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         int * /*INFO*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  ppotrf(const char *UPLO,
+         const int * N,
+         double *    A,
+         const int * IA,
+         const int * JA,
+         const int * DESCA,
+         int *       INFO)
+  {
+    pdpotrf_(UPLO, N, A, IA, JA, DESCA, INFO);
+  }
+
+  inline void
+  ppotrf(const char *UPLO,
+         const int * N,
+         float *     A,
+         const int * IA,
+         const int * JA,
+         const int * DESCA,
+         int *       INFO)
+  {
+    pspotrf_(UPLO, N, A, IA, JA, DESCA, INFO);
+  }
+
+  inline void
+  ppotrf(const char *          UPLO,
+         const int *           N,
+         std::complex<double> *A,
+         const int *           IA,
+         const int *           JA,
+         const int *           DESCA,
+         int *                 INFO)
+  {
+    pzpotrf_(UPLO, N, A, IA, JA, DESCA, INFO);
+  }
+
+  template <typename number>
+  inline void
+  pgetrf(const int * /*m*/,
+         const int * /*n*/,
+         number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         int * /*ipiv*/,
+         int * /*INFO*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgetrf(const int *m,
+         const int *n,
+         double *   A,
+         const int *IA,
+         const int *JA,
+         const int *DESCA,
+         int *      ipiv,
+         int *      INFO)
+  {
+    pdgetrf_(m, n, A, IA, JA, DESCA, ipiv, INFO);
+  }
+
+  inline void
+  pgetrf(const int *m,
+         const int *n,
+         float *    A,
+         const int *IA,
+         const int *JA,
+         const int *DESCA,
+         int *      ipiv,
+         int *      INFO)
+  {
+    psgetrf_(m, n, A, IA, JA, DESCA, ipiv, INFO);
+  }
+
+  inline void
+  pgetrf(const int *           m,
+         const int *           n,
+         std::complex<double> *A,
+         const int *           IA,
+         const int *           JA,
+         const int *           DESCA,
+         int *                 ipiv,
+         int *                 INFO)
+  {
+    pzgetrf_(m, n, A, IA, JA, DESCA, ipiv, INFO);
+  }
+
+  template <typename number>
+  inline void
+  ppotri(const char * /*UPLO*/,
+         const int * /*N*/,
+         number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         int * /*INFO*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  ppotri(const char *UPLO,
+         const int * N,
+         double *    A,
+         const int * IA,
+         const int * JA,
+         const int * DESCA,
+         int *       INFO)
+  {
+    pdpotri_(UPLO, N, A, IA, JA, DESCA, INFO);
+  }
+
+  inline void
+  ppotri(const char *UPLO,
+         const int * N,
+         float *     A,
+         const int * IA,
+         const int * JA,
+         const int * DESCA,
+         int *       INFO)
+  {
+    pspotri_(UPLO, N, A, IA, JA, DESCA, INFO);
+  }
+
+  inline void
+  ppotri(const char *          UPLO,
+         const int *           N,
+         std::complex<double> *A,
+         const int *           IA,
+         const int *           JA,
+         const int *           DESCA,
+         int *                 INFO)
+  {
+    pzpotri_(UPLO, N, A, IA, JA, DESCA, INFO);
+  }
+
+  template <typename number>
+  inline void
+  pgetri(const int * /*N*/,
+         number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         const int * /*ipiv*/,
+         number * /*work*/,
+         int * /*lwork*/,
+         int * /*iwork*/,
+         int * /*liwork*/,
+         int * /*info*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgetri(const int *N,
+         double *   A,
+         const int *IA,
+         const int *JA,
+         const int *DESCA,
+         const int *ipiv,
+         double *   work,
+         int *      lwork,
+         int *      iwork,
+         int *      liwork,
+         int *      info)
+  {
+    pdgetri_(N, A, IA, JA, DESCA, ipiv, work, lwork, iwork, liwork, info);
+  }
+
+  inline void
+  pgetri(const int *N,
+         float *    A,
+         const int *IA,
+         const int *JA,
+         const int *DESCA,
+         const int *ipiv,
+         float *    work,
+         int *      lwork,
+         int *      iwork,
+         int *      liwork,
+         int *      info)
+  {
+    psgetri_(N, A, IA, JA, DESCA, ipiv, work, lwork, iwork, liwork, info);
+  }
+
+  inline void
+  pgetri(const int *           N,
+         std::complex<double> *A,
+         const int *           IA,
+         const int *           JA,
+         const int *           DESCA,
+         const int *           ipiv,
+         std::complex<double> *work,
+         int *                 lwork,
+         int *                 iwork,
+         int *                 liwork,
+         int *                 info)
+  {
+    pzgetri_(N, A, IA, JA, DESCA, ipiv, work, lwork, iwork, liwork, info);
+  }
+
+
+
+  template <typename number>
+  inline void
+  ptrtri(const char * /*UPLO*/,
+         const char * /*DIAG*/,
+         const int * /*N*/,
+         number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         int * /*INFO*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  ptrtri(const char *UPLO,
+         const char *DIAG,
+         const int * N,
+         double *    A,
+         const int * IA,
+         const int * JA,
+         const int * DESCA,
+         int *       INFO)
+  {
+    pdtrtri_(UPLO, DIAG, N, A, IA, JA, DESCA, INFO);
+  }
+
+  inline void
+  ptrtri(const char *UPLO,
+         const char *DIAG,
+         const int * N,
+         float *     A,
+         const int * IA,
+         const int * JA,
+         const int * DESCA,
+         int *       INFO)
+  {
+    pstrtri_(UPLO, DIAG, N, A, IA, JA, DESCA, INFO);
+  }
+
+  inline void
+  ptrtri(const char *          UPLO,
+         const char *          DIAG,
+         const int *           N,
+         std::complex<double> *A,
+         const int *           IA,
+         const int *           JA,
+         const int *           DESCA,
+         int *                 INFO)
+  {
+    pztrtri_(UPLO, DIAG, N, A, IA, JA, DESCA, INFO);
+  }
+
+  template <typename number>
+  inline void
+  ppocon(const char * /*uplo*/,
+         const int * /*N*/,
+         const number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         const number * /*ANORM*/,
+         number * /*RCOND*/,
+         number * /*WORK*/,
+         const int * /*LWORK*/,
+         int * /*IWORK*/,
+         const int * /*LIWORK*/,
+         int * /*INFO*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  ppocon(const char *  uplo,
+         const int *   N,
+         const double *A,
+         const int *   IA,
+         const int *   JA,
+         const int *   DESCA,
+         const double *ANORM,
+         double *      RCOND,
+         double *      WORK,
+         const int *   LWORK,
+         int *         IWORK,
+         const int *   LIWORK,
+         int *         INFO)
+  {
+    pdpocon_(uplo,
+             N,
+             A,
+             IA,
+             JA,
+             DESCA,
+             ANORM,
+             RCOND,
+             WORK,
+             LWORK,
+             IWORK,
+             LIWORK,
+             INFO);
+  }
+
+  inline void
+  ppocon(const char * uplo,
+         const int *  N,
+         const float *A,
+         const int *  IA,
+         const int *  JA,
+         const int *  DESCA,
+         const float *ANORM,
+         float *      RCOND,
+         float *      WORK,
+         const int *  LWORK,
+         int *        IWORK,
+         const int *  LIWORK,
+         int *        INFO)
+  {
+    pspocon_(uplo,
+             N,
+             A,
+             IA,
+             JA,
+             DESCA,
+             ANORM,
+             RCOND,
+             WORK,
+             LWORK,
+             IWORK,
+             LIWORK,
+             INFO);
+  }
+
+
+  template <typename number>
+  inline number
+  plansy(const char * /*norm*/,
+         const char * /*uplo*/,
+         const int * /*N*/,
+         const number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         number * /*work*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline double
+  plansy(const char *  norm,
+         const char *  uplo,
+         const int *   N,
+         const double *A,
+         const int *   IA,
+         const int *   JA,
+         const int *   DESCA,
+         double *      work)
+  {
+    return pdlansy_(norm, uplo, N, A, IA, JA, DESCA, work);
+  }
+
+  inline float
+  plansy(const char * norm,
+         const char * uplo,
+         const int *  N,
+         const float *A,
+         const int *  IA,
+         const int *  JA,
+         const int *  DESCA,
+         float *      work)
+  {
+    return pslansy_(norm, uplo, N, A, IA, JA, DESCA, work);
+  }
+
+
+  template <typename number>
+  inline void
+  pgesv(const int * /*n*/,
+        const int * /*nrhs*/,
+        number * /*A*/,
+        const int * /*ia*/,
+        const int * /*ja*/,
+        const int * /*desca*/,
+        int * /*ipiv*/,
+        number * /*B*/,
+        const int * /*ib*/,
+        const int * /*jb*/,
+        const int * /*descb*/,
+        int * /*info*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgesv(const int *n,
+        const int *nrhs,
+        double *   A,
+        const int *ia,
+        const int *ja,
+        const int *desca,
+        int *      ipiv,
+        double *   B,
+        const int *ib,
+        const int *jb,
+        const int *descb,
+        int *      info)
+  {
+    pdgesv_(n, nrhs, A, ia, ja, desca, ipiv, B, ib, jb, descb, info);
+  }
+
+  inline void
+  pgesv(const int *n,
+        const int *nrhs,
+        float *    A,
+        const int *ia,
+        const int *ja,
+        const int *desca,
+        int *      ipiv,
+        float *    B,
+        const int *ib,
+        const int *jb,
+        const int *descb,
+        int *      info)
+  {
+    psgesv_(n, nrhs, A, ia, ja, desca, ipiv, B, ib, jb, descb, info);
+  }
+
+
+  template <typename number>
+  inline void
+  pgemm(const char * /*transa*/,
+        const char * /*transb*/,
+        const int * /*m*/,
+        const int * /*n*/,
+        const int * /*k*/,
+        const number * /*alpha*/,
+        const number * /*A*/,
+        const int * /*IA*/,
+        const int * /*JA*/,
+        const int * /*DESCA*/,
+        const number * /*B*/,
+        const int * /*IB*/,
+        const int * /*JB*/,
+        const int * /*DESCB*/,
+        const number * /*beta*/,
+        number * /*C*/,
+        const int * /*IC*/,
+        const int * /*JC*/,
+        const int * /*DESCC*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgemm(const char *  transa,
+        const char *  transb,
+        const int *   m,
+        const int *   n,
+        const int *   k,
+        const double *alpha,
+        const double *A,
+        const int *   IA,
+        const int *   JA,
+        const int *   DESCA,
+        const double *B,
+        const int *   IB,
+        const int *   JB,
+        const int *   DESCB,
+        const double *beta,
+        double *      C,
+        const int *   IC,
+        const int *   JC,
+        const int *   DESCC)
+  {
+    pdgemm_(transa,
+            transb,
+            m,
+            n,
+            k,
+            alpha,
+            A,
+            IA,
+            JA,
+            DESCA,
+            B,
+            IB,
+            JB,
+            DESCB,
+            beta,
+            C,
+            IC,
+            JC,
+            DESCC);
+  }
+
+  inline void
+  pgemm(const char * transa,
+        const char * transb,
+        const int *  m,
+        const int *  n,
+        const int *  k,
+        const float *alpha,
+        const float *A,
+        const int *  IA,
+        const int *  JA,
+        const int *  DESCA,
+        const float *B,
+        const int *  IB,
+        const int *  JB,
+        const int *  DESCB,
+        const float *beta,
+        float *      C,
+        const int *  IC,
+        const int *  JC,
+        const int *  DESCC)
+  {
+    psgemm_(transa,
+            transb,
+            m,
+            n,
+            k,
+            alpha,
+            A,
+            IA,
+            JA,
+            DESCA,
+            B,
+            IB,
+            JB,
+            DESCB,
+            beta,
+            C,
+            IC,
+            JC,
+            DESCC);
+  }
+
+
+  inline void
+  pgemm(const char *                transa,
+        const char *                transb,
+        const int *                 m,
+        const int *                 n,
+        const int *                 k,
+        const std::complex<double> *alpha,
+        const std::complex<double> *A,
+        const int *                 IA,
+        const int *                 JA,
+        const int *                 DESCA,
+        const std::complex<double> *B,
+        const int *                 IB,
+        const int *                 JB,
+        const int *                 DESCB,
+        const std::complex<double> *beta,
+        std::complex<double> *      C,
+        const int *                 IC,
+        const int *                 JC,
+        const int *                 DESCC)
+  {
+    pzgemm_(transa,
+            transb,
+            m,
+            n,
+            k,
+            alpha,
+            A,
+            IA,
+            JA,
+            DESCA,
+            B,
+            IB,
+            JB,
+            DESCB,
+            beta,
+            C,
+            IC,
+            JC,
+            DESCC);
+  }
+
+  template <typename number>
+  inline number
+  plange(const char * /*norm*/,
+         const int * /*m*/,
+         const int * /*n*/,
+         const number * /*A*/,
+         const int * /*ia*/,
+         const int * /*ja*/,
+         const int * /*desca*/,
+         number * /*work*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline double
+  plange(const char *  norm,
+         const int *   m,
+         const int *   n,
+         const double *A,
+         const int *   ia,
+         const int *   ja,
+         const int *   desca,
+         double *      work)
+  {
+    return pdlange_(norm, m, n, A, ia, ja, desca, work);
+  }
+
+  inline float
+  plange(const char * norm,
+         const int *  m,
+         const int *  n,
+         const float *A,
+         const int *  ia,
+         const int *  ja,
+         const int *  desca,
+         float *      work)
+  {
+    return pslange_(norm, m, n, A, ia, ja, desca, work);
+  }
+
+
+  template <typename number>
+  inline void
+  psyev(const char * /*jobz*/,
+        const char * /*uplo*/,
+        const int * /*m*/,
+        number * /*A*/,
+        const int * /*ia*/,
+        const int * /*ja*/,
+        int * /*desca*/,
+        number * /*w*/,
+        number * /*z*/,
+        const int * /*iz*/,
+        const int * /*jz*/,
+        int * /*descz*/,
+        number * /*work*/,
+        const int * /*lwork*/,
+        int * /*info*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  psyev(const char *jobz,
+        const char *uplo,
+        const int * m,
+        double *    A,
+        const int * ia,
+        const int * ja,
+        int *       desca,
+        double *    w,
+        double *    z,
+        const int * iz,
+        const int * jz,
+        int *       descz,
+        double *    work,
+        const int * lwork,
+        int *       info)
+  {
+    pdsyev_(
+      jobz, uplo, m, A, ia, ja, desca, w, z, iz, jz, descz, work, lwork, info);
+  }
+
+  inline void
+  psyev(const char *jobz,
+        const char *uplo,
+        const int * m,
+        float *     A,
+        const int * ia,
+        const int * ja,
+        int *       desca,
+        float *     w,
+        float *     z,
+        const int * iz,
+        const int * jz,
+        int *       descz,
+        float *     work,
+        const int * lwork,
+        int *       info)
+  {
+    pssyev_(
+      jobz, uplo, m, A, ia, ja, desca, w, z, iz, jz, descz, work, lwork, info);
+  }
+
+  inline void
+  psyev(const char *          jobz,
+        const char *          uplo,
+        const int *           m,
+        std::complex<double> *A,
+        const int *           ia,
+        const int *           ja,
+        int *                 desca,
+        double *              w,
+        std::complex<double> *z,
+        const int *           iz,
+        const int *           jz,
+        int *                 descz,
+        std::complex<double> *work,
+        const int *           lwork,
+        int *                 info)
+  {
+    pzheev_(
+      jobz, uplo, m, A, ia, ja, desca, w, z, iz, jz, descz, work, lwork, info);
+  }
+
+  template <typename number>
+  inline void
+  placpy(const char * /*uplo*/,
+         const int * /*m*/,
+         const int * /*n*/,
+         const number * /*A*/,
+         const int * /*ia*/,
+         const int * /*ja*/,
+         const int * /*desca*/,
+         number * /*B*/,
+         const int * /*ib*/,
+         const int * /*jb*/,
+         const int * /*descb*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  placpy(const char *  uplo,
+         const int *   m,
+         const int *   n,
+         const double *A,
+         const int *   ia,
+         const int *   ja,
+         const int *   desca,
+         double *      B,
+         const int *   ib,
+         const int *   jb,
+         const int *   descb)
+  {
+    pdlacpy_(uplo, m, n, A, ia, ja, desca, B, ib, jb, descb);
+  }
+
+  inline void
+  placpy(const char * uplo,
+         const int *  m,
+         const int *  n,
+         const float *A,
+         const int *  ia,
+         const int *  ja,
+         const int *  desca,
+         float *      B,
+         const int *  ib,
+         const int *  jb,
+         const int *  descb)
+  {
+    pslacpy_(uplo, m, n, A, ia, ja, desca, B, ib, jb, descb);
+  }
+
+
+  template <typename number>
+  inline void
+  pgemr2d(const int * /*m*/,
+          const int * /*n*/,
+          const number * /*A*/,
+          const int * /*ia*/,
+          const int * /*ja*/,
+          const int * /*desca*/,
+          number * /*B*/,
+          const int * /*ib*/,
+          const int * /*jb*/,
+          const int * /*descb*/,
+          const int * /*ictxt*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgemr2d(const int *   m,
+          const int *   n,
+          const double *A,
+          const int *   ia,
+          const int *   ja,
+          const int *   desca,
+          double *      B,
+          const int *   ib,
+          const int *   jb,
+          const int *   descb,
+          const int *   ictxt)
+  {
+    pdgemr2d_(m, n, A, ia, ja, desca, B, ib, jb, descb, ictxt);
+  }
+
+  inline void
+  pgemr2d(const int *  m,
+          const int *  n,
+          const float *A,
+          const int *  ia,
+          const int *  ja,
+          const int *  desca,
+          float *      B,
+          const int *  ib,
+          const int *  jb,
+          const int *  descb,
+          const int *  ictxt)
+  {
+    psgemr2d_(m, n, A, ia, ja, desca, B, ib, jb, descb, ictxt);
+  }
+
+
+  template <typename number>
+  inline void
+  plamch(const int * /*ictxt*/, const char * /*cmach*/, number & /*val*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  plamch(const int *ictxt, const char *cmach, double &val)
+  {
+    val = pdlamch_(ictxt, cmach);
+  }
+
+  inline void
+  plamch(const int *ictxt, const char *cmach, float &val)
+  {
+    val = pslamch_(ictxt, cmach);
+  }
+
+
+  template <typename number>
+  inline void
+  psyevx(const char * /*jobz*/,
+         const char * /*range*/,
+         const char * /*uplo*/,
+         const int * /*n*/,
+         number * /*A*/,
+         const int * /*ia*/,
+         const int * /*ja*/,
+         const int * /*desca*/,
+         number * /*VL*/,
+         number * /*VU*/,
+         const int * /*il*/,
+         const int * /*iu*/,
+         number * /*abstol*/,
+         const int * /*m*/,
+         const int * /*nz*/,
+         number * /*w*/,
+         number * /*orfac*/,
+         number * /*Z*/,
+         const int * /*iz*/,
+         const int * /*jz*/,
+         const int * /*descz*/,
+         number * /*work*/,
+         int * /*lwork*/,
+         int * /*iwork*/,
+         int * /*liwork*/,
+         int * /*ifail*/,
+         int * /*iclustr*/,
+         number * /*gap*/,
+         int * /*info*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  psyevx(const char *jobz,
+         const char *range,
+         const char *uplo,
+         const int * n,
+         double *    A,
+         const int * ia,
+         const int * ja,
+         const int * desca,
+         double *    VL,
+         double *    VU,
+         const int * il,
+         const int * iu,
+         double *    abstol,
+         const int * m,
+         const int * nz,
+         double *    w,
+         double *    orfac,
+         double *    Z,
+         const int * iz,
+         const int * jz,
+         const int * descz,
+         double *    work,
+         int *       lwork,
+         int *       iwork,
+         int *       liwork,
+         int *       ifail,
+         int *       iclustr,
+         double *    gap,
+         int *       info)
+  {
+    pdsyevx_(jobz,
+             range,
+             uplo,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             VL,
+             VU,
+             il,
+             iu,
+             abstol,
+             m,
+             nz,
+             w,
+             orfac,
+             Z,
+             iz,
+             jz,
+             descz,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             ifail,
+             iclustr,
+             gap,
+             info);
+  }
+
+  inline void
+  psyevx(const char *jobz,
+         const char *range,
+         const char *uplo,
+         const int * n,
+         float *     A,
+         const int * ia,
+         const int * ja,
+         const int * desca,
+         float *     VL,
+         float *     VU,
+         const int * il,
+         const int * iu,
+         float *     abstol,
+         const int * m,
+         const int * nz,
+         float *     w,
+         float *     orfac,
+         float *     Z,
+         const int * iz,
+         const int * jz,
+         const int * descz,
+         float *     work,
+         int *       lwork,
+         int *       iwork,
+         int *       liwork,
+         int *       ifail,
+         int *       iclustr,
+         float *     gap,
+         int *       info)
+  {
+    pssyevx_(jobz,
+             range,
+             uplo,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             VL,
+             VU,
+             il,
+             iu,
+             abstol,
+             m,
+             nz,
+             w,
+             orfac,
+             Z,
+             iz,
+             jz,
+             descz,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             ifail,
+             iclustr,
+             gap,
+             info);
+  }
+
+  inline void
+  psyevx(const char *          jobz,
+         const char *          range,
+         const char *          uplo,
+         const int *           n,
+         std::complex<double> *A,
+         const int *           ia,
+         const int *           ja,
+         const int *           desca,
+         double *              VL,
+         double *              VU,
+         const int *           il,
+         const int *           iu,
+         double *              abstol,
+         const int *           m,
+         const int *           nz,
+         double *              w,
+         double *              orfac,
+         std::complex<double> *Z,
+         const int *           iz,
+         const int *           jz,
+         const int *           descz,
+         std::complex<double> *work,
+         int *                 lwork,
+         int *                 iwork,
+         int *                 liwork,
+         int *                 ifail,
+         int *                 iclustr,
+         double *              gap,
+         int *                 info)
+  {
+    pzheevx_(jobz,
+             range,
+             uplo,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             VL,
+             VU,
+             il,
+             iu,
+             abstol,
+             m,
+             nz,
+             w,
+             orfac,
+             Z,
+             iz,
+             jz,
+             descz,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             ifail,
+             iclustr,
+             gap,
+             info);
+  }
+
+  template <typename number>
+  inline void
+  pgesvd(const char * /*jobu*/,
+         const char * /*jobvt*/,
+         const int * /*m*/,
+         const int * /*n*/,
+         number * /*A*/,
+         const int * /*ia*/,
+         const int * /*ja*/,
+         const int * /*desca*/,
+         number * /*S*/,
+         number * /*U*/,
+         const int * /*iu*/,
+         const int * /*ju*/,
+         const int * /*descu*/,
+         number * /*VT*/,
+         const int * /*ivt*/,
+         const int * /*jvt*/,
+         const int * /*descvt*/,
+         number * /*work*/,
+         int * /*lwork*/,
+         int * /*info*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgesvd(const char *jobu,
+         const char *jobvt,
+         const int * m,
+         const int * n,
+         double *    A,
+         const int * ia,
+         const int * ja,
+         const int * desca,
+         double *    S,
+         double *    U,
+         const int * iu,
+         const int * ju,
+         const int * descu,
+         double *    VT,
+         const int * ivt,
+         const int * jvt,
+         const int * descvt,
+         double *    work,
+         int *       lwork,
+         int *       info)
+  {
+    pdgesvd_(jobu,
+             jobvt,
+             m,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             S,
+             U,
+             iu,
+             ju,
+             descu,
+             VT,
+             ivt,
+             jvt,
+             descvt,
+             work,
+             lwork,
+             info);
+  }
+
+  inline void
+  pgesvd(const char *jobu,
+         const char *jobvt,
+         const int * m,
+         const int * n,
+         float *     A,
+         const int * ia,
+         const int * ja,
+         const int * desca,
+         float *     S,
+         float *     U,
+         const int * iu,
+         const int * ju,
+         const int * descu,
+         float *     VT,
+         const int * ivt,
+         const int * jvt,
+         const int * descvt,
+         float *     work,
+         int *       lwork,
+         int *       info)
+  {
+    psgesvd_(jobu,
+             jobvt,
+             m,
+             n,
+             A,
+             ia,
+             ja,
+             desca,
+             S,
+             U,
+             iu,
+             ju,
+             descu,
+             VT,
+             ivt,
+             jvt,
+             descvt,
+             work,
+             lwork,
+             info);
+  }
+
+
+  template <typename number>
+  inline void
+  pgels(const char * /*trans*/,
+        const int * /*m*/,
+        const int * /*n*/,
+        const int * /*nrhs*/,
+        number * /*A*/,
+        const int * /*ia*/,
+        const int * /*ja*/,
+        const int * /*desca*/,
+        number * /*B*/,
+        const int * /*ib*/,
+        const int * /*jb*/,
+        const int * /*descb*/,
+        number * /*work*/,
+        int * /*lwork*/,
+        int * /*info*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgels(const char *trans,
+        const int * m,
+        const int * n,
+        const int * nrhs,
+        double *    A,
+        const int * ia,
+        const int * ja,
+        const int * desca,
+        double *    B,
+        const int * ib,
+        const int * jb,
+        const int * descb,
+        double *    work,
+        int *       lwork,
+        int *       info)
+  {
+    pdgels_(
+      trans, m, n, nrhs, A, ia, ja, desca, B, ib, jb, descb, work, lwork, info);
+  }
+
+  inline void
+  pgels(const char *trans,
+        const int * m,
+        const int * n,
+        const int * nrhs,
+        float *     A,
+        const int * ia,
+        const int * ja,
+        const int * desca,
+        float *     B,
+        const int * ib,
+        const int * jb,
+        const int * descb,
+        float *     work,
+        int *       lwork,
+        int *       info)
+  {
+    psgels_(
+      trans, m, n, nrhs, A, ia, ja, desca, B, ib, jb, descb, work, lwork, info);
+  }
+
+
+  template <typename number>
+  inline void
+  pgeadd(const char * /*transa*/,
+         const int * /*m*/,
+         const int * /*n*/,
+         const number * /*alpha*/,
+         const number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         const number * /*beta*/,
+         number * /*C*/,
+         const int * /*IC*/,
+         const int * /*JC*/,
+         const int * /*DESCC*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  pgeadd(const char *  transa,
+         const int *   m,
+         const int *   n,
+         const double *alpha,
+         const double *A,
+         const int *   IA,
+         const int *   JA,
+         const int *   DESCA,
+         const double *beta,
+         double *      C,
+         const int *   IC,
+         const int *   JC,
+         const int *   DESCC)
+  {
+    pdgeadd_(transa, m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC);
+  }
+
+  inline void
+  pgeadd(const char * transa,
+         const int *  m,
+         const int *  n,
+         const float *alpha,
+         const float *A,
+         const int *  IA,
+         const int *  JA,
+         const int *  DESCA,
+         const float *beta,
+         float *      C,
+         const int *  IC,
+         const int *  JC,
+         const int *  DESCC)
+  {
+    psgeadd_(transa, m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC);
+  }
+
+  inline void
+  pgeadd(const char *                transa,
+         const int *                 m,
+         const int *                 n,
+         const std::complex<double> *alpha,
+         const std::complex<double> *A,
+         const int *                 IA,
+         const int *                 JA,
+         const int *                 DESCA,
+         const std::complex<double> *beta,
+         std::complex<double> *      C,
+         const int *                 IC,
+         const int *                 JC,
+         const int *                 DESCC)
+  {
+    pzgeadd_(transa, m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC);
+  }
+
+  template <typename number>
+  inline void
+  ptran(const int * /*m*/,
+        const int * /*n*/,
+        const number * /*alpha*/,
+        const number * /*A*/,
+        const int * /*IA*/,
+        const int * /*JA*/,
+        const int * /*DESCA*/,
+        const number * /*beta*/,
+        number * /*C*/,
+        const int * /*IC*/,
+        const int * /*JC*/,
+        const int * /*DESCC*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  ptran(const int *   m,
+        const int *   n,
+        const double *alpha,
+        const double *A,
+        const int *   IA,
+        const int *   JA,
+        const int *   DESCA,
+        const double *beta,
+        double *      C,
+        const int *   IC,
+        const int *   JC,
+        const int *   DESCC)
+  {
+    pdtran_(m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC);
+  }
+
+  inline void
+  ptran(const int *  m,
+        const int *  n,
+        const float *alpha,
+        const float *A,
+        const int *  IA,
+        const int *  JA,
+        const int *  DESCA,
+        const float *beta,
+        float *      C,
+        const int *  IC,
+        const int *  JC,
+        const int *  DESCC)
+  {
+    pstran_(m, n, alpha, A, IA, JA, DESCA, beta, C, IC, JC, DESCC);
+  }
+
+
+  template <typename number>
+  inline void
+  psyevr(const char * /*jobz*/,
+         const char * /*range*/,
+         const char * /*uplo*/,
+         const int * /*n*/,
+         number * /*A*/,
+         const int * /*IA*/,
+         const int * /*JA*/,
+         const int * /*DESCA*/,
+         const number * /*VL*/,
+         const number * /*VU*/,
+         const int * /*IL*/,
+         const int * /*IU*/,
+         int * /*m*/,
+         int * /*nz*/,
+         number * /*w*/,
+         number * /*Z*/,
+         const int * /*IZ*/,
+         const int * /*JZ*/,
+         const int * /*DESCZ*/,
+         number * /*work*/,
+         int * /*lwork*/,
+         int * /*iwork*/,
+         int * /*liwork*/,
+         int * /*info*/)
+  {
+    AssertThrow(false, dealii::ExcNotImplemented());
+  }
+
+  inline void
+  psyevr(const char *  jobz,
+         const char *  range,
+         const char *  uplo,
+         const int *   n,
+         double *      A,
+         const int *   IA,
+         const int *   JA,
+         const int *   DESCA,
+         const double *VL,
+         const double *VU,
+         const int *   IL,
+         const int *   IU,
+         int *         m,
+         int *         nz,
+         double *      w,
+         double *      Z,
+         const int *   IZ,
+         const int *   JZ,
+         const int *   DESCZ,
+         double *      work,
+         int *         lwork,
+         int *         iwork,
+         int *         liwork,
+         int *         info)
+  {
+    /*
+     * Netlib ScaLAPACK performs floating point tests (e.g. divide-by-zero)
+     * within the call to pdsyevr causing floating point exceptions to be thrown
+     * (at least in debug mode). Therefore, we wrap the calls to pdsyevr into
+     * the following code to suppress the exception.
+     */
+#ifdef DEAL_II_HAVE_FP_EXCEPTIONS
+    fenv_t fp_exceptions;
+    feholdexcept(&fp_exceptions);
+#endif
+
+    pdsyevr_(jobz,
+             range,
+             uplo,
+             n,
+             A,
+             IA,
+             JA,
+             DESCA,
+             VL,
+             VU,
+             IL,
+             IU,
+             m,
+             nz,
+             w,
+             Z,
+             IZ,
+             JZ,
+             DESCZ,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             info);
+
+#ifdef DEAL_II_HAVE_FP_EXCEPTIONS
+    fesetenv(&fp_exceptions);
+#endif
+  }
+
+  inline void
+  psyevr(const char * jobz,
+         const char * range,
+         const char * uplo,
+         const int *  n,
+         float *      A,
+         const int *  IA,
+         const int *  JA,
+         const int *  DESCA,
+         const float *VL,
+         const float *VU,
+         const int *  IL,
+         const int *  IU,
+         int *        m,
+         int *        nz,
+         float *      w,
+         float *      Z,
+         const int *  IZ,
+         const int *  JZ,
+         const int *  DESCZ,
+         float *      work,
+         int *        lwork,
+         int *        iwork,
+         int *        liwork,
+         int *        info)
+  {
+    /*
+     * Netlib ScaLAPACK performs floating point tests (e.g. divide-by-zero)
+     * within the call to pssyevr causing floating point exceptions to be thrown
+     * (at least in debug mode). Therefore, we wrap the calls to pssyevr into
+     * the following code to suppress the exception.
+     */
+#ifdef DEAL_II_HAVE_FP_EXCEPTIONS
+    fenv_t fp_exceptions;
+    feholdexcept(&fp_exceptions);
+#endif
+
+    pssyevr_(jobz,
+             range,
+             uplo,
+             n,
+             A,
+             IA,
+             JA,
+             DESCA,
+             VL,
+             VU,
+             IL,
+             IU,
+             m,
+             nz,
+             w,
+             Z,
+             IZ,
+             JZ,
+             DESCZ,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             info);
+
+#ifdef DEAL_II_HAVE_FP_EXCEPTIONS
+    fesetenv(&fp_exceptions);
+#endif
+  }
+
+  inline void
+  psyevr(const char *          jobz,
+         const char *          range,
+         const char *          uplo,
+         const int *           n,
+         std::complex<double> *A,
+         const int *           IA,
+         const int *           JA,
+         const int *           DESCA,
+         const double *        VL,
+         const double *        VU,
+         const int *           IL,
+         const int *           IU,
+         int *                 m,
+         int *                 nz,
+         double *              w,
+         std::complex<double> *Z,
+         const int *           IZ,
+         const int *           JZ,
+         const int *           DESCZ,
+         std::complex<double> *work,
+         int *                 lwork,
+         int *                 iwork,
+         int *                 liwork,
+         int *                 info)
+  {
+    /*
+     * Netlib ScaLAPACK performs floating point tests (e.g. divide-by-zero)
+     * within the call to pdsyevr causing floating point exceptions to be thrown
+     * (at least in debug mode). Therefore, we wrap the calls to pdsyevr into
+     * the following code to suppress the exception.
+     */
+#ifdef DEAL_II_HAVE_FP_EXCEPTIONS
+    fenv_t fp_exceptions;
+    feholdexcept(&fp_exceptions);
+#endif
+
+    pzheevr_(jobz,
+             range,
+             uplo,
+             n,
+             A,
+             IA,
+             JA,
+             DESCA,
+             VL,
+             VU,
+             IL,
+             IU,
+             m,
+             nz,
+             w,
+             Z,
+             IZ,
+             JZ,
+             DESCZ,
+             work,
+             lwork,
+             iwork,
+             liwork,
+             info);
+
+#ifdef DEAL_II_HAVE_FP_EXCEPTIONS
+    fesetenv(&fp_exceptions);
+#endif
+  }
+
+  inline int
+  lworkFromWork(std::vector<double> &work)
+  {
+    return static_cast<int>(work[0]);
+  }
+
+  inline int
+  lworkFromWork(std::vector<float> &work)
+  {
+    return static_cast<int>(work[0]);
+  }
+
+  inline int
+  lworkFromWork(std::vector<std::complex<double>> &work)
+  {
+    return static_cast<int>(work[0].real());
+  }
+} // namespace dftefeScalapack
+#endif // dftfe_scalapack_templates_h
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackHeaders.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackHeaders.h
new file mode 100644
index 000000000..a261533df
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackHeaders.h
@@ -0,0 +1,119 @@
+//// ---------------------------------------------------------------------
+////
+//// Copyright (c) 2017-2022 The Regents of the University of Michigan and
+/// DFT-FE / authors.
+////
+//// This file is part of the DFT-FE code.
+////
+//// The DFT-FE code is free software; you can use it, redistribute
+//// it, and/or modify it under the terms of the GNU Lesser General
+//// Public License as published by the Free Software Foundation; either
+//// version 2.1 of the License, or (at your option) any later version.
+//// The full text of the license can be found in the file LICENSE at
+//// the top level of the DFT-FE distribution.
+////
+//// ---------------------------------------------------------------------
+////
+//// @author Shiva Rudraraju (2016), Phani Motamarri (2016), Sambit Das (2018)
+////
+//
+//#ifndef headers_H_
+//#define headers_H_
+//
+//#ifndef DOXYGEN_SHOULD_SKIP_THIS
+//// Include all deal.II header file
+//#  include <deal.II/base/conditional_ostream.h>
+//#  include <deal.II/base/function.h>
+//#  include <deal.II/base/logstream.h>
+//#  include <deal.II/base/point.h>
+//#  include <deal.II/base/quadrature.h>
+//#  include <deal.II/base/quadrature_point_data.h>
+//#  include <deal.II/base/table.h>
+//#  include <deal.II/base/tensor_function.h>
+//#  include <deal.II/base/timer.h>
+//#  include <deal.II/base/utilities.h>
+//
+//#  include <deal.II/distributed/grid_refinement.h>
+//#  include <deal.II/distributed/solution_transfer.h>
+//#  include <deal.II/distributed/tria.h>
+//
+//#  include <deal.II/dofs/dof_accessor.h>
+//#  include <deal.II/dofs/dof_handler.h>
+//#  include <deal.II/dofs/dof_renumbering.h>
+//#  include <deal.II/dofs/dof_tools.h>
+//
+//#  include <deal.II/fe/fe_q.h>
+//#  include <deal.II/fe/fe_system.h>
+//#  include <deal.II/fe/fe_values.h>
+//#  include <deal.II/fe/mapping_q1.h>
+//
+//#  include <deal.II/grid/grid_generator.h>
+//#  include <deal.II/grid/grid_in.h>
+//#  include <deal.II/grid/grid_out.h>
+//#  include <deal.II/grid/grid_refinement.h>
+//#  include <deal.II/grid/grid_tools.h>
+//#  include <deal.II/grid/tria.h>
+//#  include <deal.II/grid/tria_accessor.h>
+//#  include <deal.II/grid/tria_iterator.h>
+//
+//#  include <deal.II/lac/affine_constraints.h>
+//#  include <deal.II/lac/exceptions.h>
+//#  include <deal.II/lac/full_matrix.h>
+//#  include <deal.II/lac/la_parallel_vector.h>
+//#  include <deal.II/lac/lapack_full_matrix.h>
+//#  include <deal.II/lac/precondition.h>
+//#  include <deal.II/lac/solver_cg.h>
+//#  include <deal.II/lac/solver_gmres.h>
+//#  include <deal.II/lac/vector.h>
+//
+//#  include <deal.II/matrix_free/fe_evaluation.h>
+//#  include <deal.II/matrix_free/matrix_free.h>
+//
+//#  include <deal.II/numerics/data_out.h>
+//#  include <deal.II/numerics/error_estimator.h>
+//#  include <deal.II/numerics/matrix_tools.h>
+//#  include <deal.II/numerics/vector_tools.h>
+//#  ifdef USE_PETSC
+//#    include <deal.II/lac/slepc_solver.h>
+//#  endif
+//#  include <deal.II/base/config.h>
+//
+//#  include <deal.II/base/smartpointer.h>
+//#  include <deal.II/base/types.h>
+//
+//#  if defined(DFTFE_WITH_GPU)
+//#    include <cuComplex.h>
+//#    include <thrust/device_vector.h>
+//#    include <thrust/complex.h>
+//#  endif
+//
+//#  include "dftfeDataTypes.h"
+//#  include "distributedMulticomponentVec.h"
+//
+//// Include generic C++ headers
+//#  include <fstream>
+//#  include <iostream>
+//#  include <fenv.h>
+//
+//#endif /* DOXYGEN_SHOULD_SKIP_THIS */
+//// commonly used  typedefs used in dftfe go here
+// namespace dftfe
+//{
+//  template <typename elem_type>
+//  using distributedCPUVec =
+//    dealii::LinearAlgebra::distributed::Vector<elem_type,
+//                                               dealii::MemorySpace::Host>;
+//#ifdef DFTFE_WITH_GPU
+//  // template <typename elem_type>
+//  // using distributedGPUVec =
+//  //  dealii::LinearAlgebra::distributed::Vector<elem_type,
+//  //                                             dealii::MemorySpace::CUDA>;
+//
+//  template <typename NumberType>
+//  using distributedGPUVec =
+//    dftfe::DistributedMulticomponentVec<NumberType, dftfe::MemorySpace::GPU>;
+//
+//#endif
+//} // namespace dftfe
+//
+//#endif
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.cpp b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.cpp
new file mode 100644
index 000000000..4a349ecd2
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.cpp
@@ -0,0 +1,1651 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2019-2020 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+// @author Sambit Das
+//
+
+#include <deal.II/base/mpi_tags.h>
+#include <deal.II/base/mpi.h>
+#include "scalapackWrapper.h"
+#include "scalapack.templates.h"
+
+namespace dftefeScalapack
+{
+  template <typename NumberType>
+  ScaLAPACKMatrix<NumberType>::ScaLAPACKMatrix(
+    const size_type                                            n_rows_,
+    const size_type                                            n_columns_,
+    const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+    const size_type                                            row_block_size_,
+    const size_type                                column_block_size_,
+    const dftefeScalapack::LAPACKSupport::Property property_)
+    : uplo('L')
+    , // for non-hermitian matrices this is not needed
+    first_process_row(0)
+    , first_process_column(0)
+    , submatrix_row(1)
+    , submatrix_column(1)
+  {
+    reinit(n_rows_,
+           n_columns_,
+           process_grid,
+           row_block_size_,
+           column_block_size_,
+           property_);
+  }
+
+
+
+  template <typename NumberType>
+  ScaLAPACKMatrix<NumberType>::ScaLAPACKMatrix(
+    const size_type                                            size,
+    const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+    const size_type                                            block_size,
+    const dftefeScalapack::LAPACKSupport::Property             property)
+    : ScaLAPACKMatrix<NumberType>(size,
+                                  size,
+                                  process_grid,
+                                  block_size,
+                                  block_size,
+                                  property)
+  {}
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::reinit(
+    const size_type                                            n_rows_,
+    const size_type                                            n_columns_,
+    const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+    const size_type                                            row_block_size_,
+    const size_type                                column_block_size_,
+    const dftefeScalapack::LAPACKSupport::Property property_)
+  {
+    Assert(row_block_size_ > 0,
+           dealii::ExcMessage("Row block size has to be positive."));
+    Assert(column_block_size_ > 0,
+           dealii::ExcMessage("Column block size has to be positive."));
+    Assert(
+      row_block_size_ <= n_rows_,
+      dealii::ExcMessage(
+        "Row block size can not be greater than the number of rows of the matrix"));
+    Assert(
+      column_block_size_ <= n_columns_,
+      dealii::ExcMessage(
+        "Column block size can not be greater than the number of columns of the matrix"));
+
+    state             = dftefeScalapack::LAPACKSupport::State::matrix;
+    property          = property_;
+    grid              = process_grid;
+    n_rows            = n_rows_;
+    n_columns         = n_columns_;
+    row_block_size    = row_block_size_;
+    column_block_size = column_block_size_;
+
+    if (grid->mpi_process_is_active)
+      {
+        // Get local sizes:
+        n_local_rows    = numroc_(&n_rows,
+                               &row_block_size,
+                               &(grid->this_process_row),
+                               &first_process_row,
+                               &(grid->n_process_rows));
+        n_local_columns = numroc_(&n_columns,
+                                  &column_block_size,
+                                  &(grid->this_process_column),
+                                  &first_process_column,
+                                  &(grid->n_process_columns));
+
+        // LLD_A = MAX(1,NUMROC(M_A, MB_A, MYROW, RSRC_A, NPROW)), different
+        // between processes
+        int lda = std::max(1, n_local_rows);
+
+        int info = 0;
+        descinit_(descriptor,
+                  &n_rows,
+                  &n_columns,
+                  &row_block_size,
+                  &column_block_size,
+                  &first_process_row,
+                  &first_process_column,
+                  &(grid->blacs_context),
+                  &lda,
+                  &info);
+        AssertThrow(info == 0,
+                    dftefeScalapack::LAPACKSupport::ExcErrorCode("descinit_",
+                                                                 info));
+
+        values.clear();
+        values.resize(n_local_rows * n_local_columns, NumberType(0.0));
+        // this->TransposeTable<NumberType>::reinit(n_local_rows,
+        // n_local_columns);
+      }
+    else
+      {
+        // set process-local variables to something telling:
+        n_local_rows    = -1;
+        n_local_columns = -1;
+        std::fill(std::begin(descriptor), std::end(descriptor), -1);
+      }
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::reinit(
+    const size_type                                            size,
+    const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+    const size_type                                            block_size,
+    const dftefeScalapack::LAPACKSupport::Property             property)
+  {
+    reinit(size, size, process_grid, block_size, block_size, property);
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::set_property(
+    const dftefeScalapack::LAPACKSupport::Property property_)
+  {
+    property = property_;
+  }
+
+
+
+  template <typename NumberType>
+  dftefeScalapack::LAPACKSupport::Property
+  ScaLAPACKMatrix<NumberType>::get_property() const
+  {
+    return property;
+  }
+
+
+  template <typename NumberType>
+  dftefeScalapack::LAPACKSupport::State
+  ScaLAPACKMatrix<NumberType>::get_state() const
+  {
+    return state;
+  }
+
+
+  template <typename NumberType>
+  unsigned int
+  ScaLAPACKMatrix<NumberType>::global_row(const unsigned int loc_row) const
+  {
+    Assert(n_local_rows >= 0 &&
+             loc_row < static_cast<unsigned int>(n_local_rows),
+           dealii::ExcIndexRange(loc_row, 0, n_local_rows));
+    const int i = loc_row + 1;
+    return indxl2g_(&i,
+                    &row_block_size,
+                    &(grid->this_process_row),
+                    &first_process_row,
+                    &(grid->n_process_rows)) -
+           1;
+  }
+
+
+
+  template <typename NumberType>
+  unsigned int
+  ScaLAPACKMatrix<NumberType>::global_column(
+    const unsigned int loc_column) const
+  {
+    Assert(n_local_columns >= 0 &&
+             loc_column < static_cast<unsigned int>(n_local_columns),
+           dealii::ExcIndexRange(loc_column, 0, n_local_columns));
+    const int j = loc_column + 1;
+    return indxl2g_(&j,
+                    &column_block_size,
+                    &(grid->this_process_column),
+                    &first_process_column,
+                    &(grid->n_process_columns)) -
+           1;
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::conjugate()
+  {
+    if (std::is_same<NumberType, std::complex<double>>::value)
+      {
+        if (this->grid->mpi_process_is_active)
+          {
+            NumberType *A_loc =
+              (this->values.size() > 0) ? this->values.data() : nullptr;
+            const int totalSize = n_rows * n_columns;
+            const int incx      = 1;
+            pplacgv(&totalSize,
+                    A_loc,
+                    &submatrix_row,
+                    &submatrix_column,
+                    descriptor,
+                    &incx);
+          }
+      }
+  }
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::copy_to(ScaLAPACKMatrix<NumberType> &dest) const
+  {
+    Assert(n_rows == dest.n_rows,
+           dealii::ExcDimensionMismatch(n_rows, dest.n_rows));
+    Assert(n_columns == dest.n_columns,
+           dealii::ExcDimensionMismatch(n_columns, dest.n_columns));
+
+    if (this->grid->mpi_process_is_active)
+      AssertThrow(
+        this->descriptor[0] == 1,
+        dealii::ExcMessage(
+          "Copying of ScaLAPACK matrices only implemented for dense matrices"));
+    if (dest.grid->mpi_process_is_active)
+      AssertThrow(
+        dest.descriptor[0] == 1,
+        dealii::ExcMessage(
+          "Copying of ScaLAPACK matrices only implemented for dense matrices"));
+
+    /*
+     * just in case of different process grids or block-cyclic distributions
+     * inter-process communication is necessary
+     * if distributed matrices have the same process grid and block sizes, local
+     * copying is enough
+     */
+    if ((this->grid != dest.grid) || (row_block_size != dest.row_block_size) ||
+        (column_block_size != dest.column_block_size))
+      {
+        /*
+         * get the MPI communicator, which is the union of the source and
+         * destination MPI communicator
+         */
+        int       ierr = 0;
+        MPI_Group group_source, group_dest, group_union;
+        ierr = MPI_Comm_group(this->grid->mpi_communicator, &group_source);
+        AssertThrowMPI(ierr);
+        ierr = MPI_Comm_group(dest.grid->mpi_communicator, &group_dest);
+        AssertThrowMPI(ierr);
+        ierr = MPI_Group_union(group_source, group_dest, &group_union);
+        AssertThrowMPI(ierr);
+        MPI_Comm mpi_communicator_union;
+
+        // to create a communicator representing the union of the source
+        // and destination MPI communicator we need a communicator that
+        // is guaranteed to contain all desired processes -- i.e.,
+        // MPI_COMM_WORLD. on the other hand, as documented in the MPI
+        // standard, MPI_Comm_create_group is not collective on all
+        // processes in the first argument, but instead is collective on
+        // only those processes listed in the group. in other words,
+        // there is really no harm in passing MPI_COMM_WORLD as the
+        // first argument, even if the program we are currently running
+        // and that is calling this function only works on a subset of
+        // processes. the same holds for the wrapper/fallback we are using here.
+
+        const int mpi_tag =
+          dealii::Utilities::MPI::internal::Tags::scalapack_copy_to2;
+        ierr = dealii::Utilities::MPI::create_group(MPI_COMM_WORLD,
+                                                    group_union,
+                                                    mpi_tag,
+                                                    &mpi_communicator_union);
+        AssertThrowMPI(ierr);
+
+        /*
+         * The routine pgemr2d requires a BLACS context resembling at least the
+         * union of process grids described by the BLACS contexts of matrix A
+         * and
+         * B
+         */
+        int union_blacs_context = Csys2blacs_handle(mpi_communicator_union);
+        const char *order       = "Col";
+        int         union_n_process_rows =
+          dealii::Utilities::MPI::n_mpi_processes(mpi_communicator_union);
+        int union_n_process_columns = 1;
+        Cblacs_gridinit(&union_blacs_context,
+                        order,
+                        union_n_process_rows,
+                        union_n_process_columns);
+
+        const NumberType *loc_vals_source = nullptr;
+        NumberType *      loc_vals_dest   = nullptr;
+
+        if (this->grid->mpi_process_is_active && (this->values.size() > 0))
+          {
+            AssertThrow(this->values.size() > 0,
+                        dealii::ExcMessage(
+                          "source: process is active but local matrix empty"));
+            loc_vals_source = this->values.data();
+          }
+        if (dest.grid->mpi_process_is_active && (dest.values.size() > 0))
+          {
+            AssertThrow(
+              dest.values.size() > 0,
+              dealii::ExcMessage(
+                "destination: process is active but local matrix empty"));
+            loc_vals_dest = dest.values.data();
+          }
+        pgemr2d(&n_rows,
+                &n_columns,
+                loc_vals_source,
+                &submatrix_row,
+                &submatrix_column,
+                descriptor,
+                loc_vals_dest,
+                &dest.submatrix_row,
+                &dest.submatrix_column,
+                dest.descriptor,
+                &union_blacs_context);
+
+        Cblacs_gridexit(union_blacs_context);
+
+        if (mpi_communicator_union != MPI_COMM_NULL)
+          {
+            ierr = MPI_Comm_free(&mpi_communicator_union);
+            AssertThrowMPI(ierr);
+          }
+        ierr = MPI_Group_free(&group_source);
+        AssertThrowMPI(ierr);
+        ierr = MPI_Group_free(&group_dest);
+        AssertThrowMPI(ierr);
+        ierr = MPI_Group_free(&group_union);
+        AssertThrowMPI(ierr);
+      }
+    else
+      // process is active in the process grid
+      if (this->grid->mpi_process_is_active)
+      dest.values = this->values;
+
+    dest.state    = state;
+    dest.property = property;
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::add(const ScaLAPACKMatrix<NumberType> &B,
+                                   const NumberType                   alpha,
+                                   const NumberType                   beta,
+                                   const bool transpose_B)
+  {
+    if (transpose_B)
+      {
+        Assert(n_rows == B.n_columns,
+               dealii::ExcDimensionMismatch(n_rows, B.n_columns));
+        Assert(n_columns == B.n_rows,
+               dealii::ExcDimensionMismatch(n_columns, B.n_rows));
+        Assert(column_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(column_block_size,
+                                            B.row_block_size));
+        Assert(row_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(row_block_size,
+                                            B.column_block_size));
+      }
+    else
+      {
+        Assert(n_rows == B.n_rows,
+               dealii::ExcDimensionMismatch(n_rows, B.n_rows));
+        Assert(n_columns == B.n_columns,
+               dealii::ExcDimensionMismatch(n_columns, B.n_columns));
+        Assert(column_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(column_block_size,
+                                            B.column_block_size));
+        Assert(row_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(row_block_size, B.row_block_size));
+      }
+    Assert(this->grid == B.grid,
+           dealii::ExcMessage(
+             "The matrices A and B need to have the same process grid"));
+
+    if (this->grid->mpi_process_is_active)
+      {
+        char        trans_b = transpose_B ? 'T' : 'N';
+        NumberType *A_loc =
+          (this->values.size() > 0) ? this->values.data() : nullptr;
+        const NumberType *B_loc =
+          (B.values.size() > 0) ? B.values.data() : nullptr;
+
+        pgeadd(&trans_b,
+               &n_rows,
+               &n_columns,
+               &beta,
+               B_loc,
+               &B.submatrix_row,
+               &B.submatrix_column,
+               B.descriptor,
+               &alpha,
+               A_loc,
+               &submatrix_row,
+               &submatrix_column,
+               descriptor);
+      }
+    state = dftefeScalapack::LAPACKSupport::matrix;
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::zadd(const ScaLAPACKMatrix<NumberType> &B,
+                                    const NumberType                   alpha,
+                                    const NumberType                   beta,
+                                    const bool conjugate_transpose_B)
+  {
+    if (conjugate_transpose_B)
+      {
+        Assert(n_rows == B.n_columns,
+               dealii::ExcDimensionMismatch(n_rows, B.n_columns));
+        Assert(n_columns == B.n_rows,
+               dealii::ExcDimensionMismatch(n_columns, B.n_rows));
+        Assert(column_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(column_block_size,
+                                            B.row_block_size));
+        Assert(row_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(row_block_size,
+                                            B.column_block_size));
+      }
+    else
+      {
+        Assert(n_rows == B.n_rows,
+               dealii::ExcDimensionMismatch(n_rows, B.n_rows));
+        Assert(n_columns == B.n_columns,
+               dealii::ExcDimensionMismatch(n_columns, B.n_columns));
+        Assert(column_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(column_block_size,
+                                            B.column_block_size));
+        Assert(row_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(row_block_size, B.row_block_size));
+      }
+    Assert(this->grid == B.grid,
+           dealii::ExcMessage(
+             "The matrices A and B need to have the same process grid"));
+
+    if (this->grid->mpi_process_is_active)
+      {
+        char trans_b =
+          conjugate_transpose_B ?
+            (std::is_same<NumberType, std::complex<double>>::value ? 'C' :
+                                                                     'T') :
+            'N';
+        NumberType *A_loc =
+          (this->values.size() > 0) ? this->values.data() : nullptr;
+        const NumberType *B_loc =
+          (B.values.size() > 0) ? B.values.data() : nullptr;
+
+        pgeadd(&trans_b,
+               &n_rows,
+               &n_columns,
+               &beta,
+               B_loc,
+               &B.submatrix_row,
+               &B.submatrix_column,
+               B.descriptor,
+               &alpha,
+               A_loc,
+               &submatrix_row,
+               &submatrix_column,
+               descriptor);
+      }
+    state = dftefeScalapack::LAPACKSupport::matrix;
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::copy_transposed(
+    const ScaLAPACKMatrix<NumberType> &B)
+  {
+    add(B, 0, 1, true);
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::copy_conjugate_transposed(
+    const ScaLAPACKMatrix<NumberType> &B)
+  {
+    zadd(B, 0, 1, true);
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::mult(const NumberType                   b,
+                                    const ScaLAPACKMatrix<NumberType> &B,
+                                    const NumberType                   c,
+                                    ScaLAPACKMatrix<NumberType> &      C,
+                                    const bool transpose_A,
+                                    const bool transpose_B) const
+  {
+    Assert(this->grid == B.grid,
+           dealii::ExcMessage(
+             "The matrices A and B need to have the same process grid"));
+    Assert(C.grid == B.grid,
+           dealii::ExcMessage(
+             "The matrices B and C need to have the same process grid"));
+
+    // see for further info:
+    // https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgemm.htm
+    if (!transpose_A && !transpose_B)
+      {
+        Assert(this->n_columns == B.n_rows,
+               dealii::ExcDimensionMismatch(this->n_columns, B.n_rows));
+        Assert(this->n_rows == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_rows, C.n_rows));
+        Assert(B.n_columns == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_columns, C.n_columns));
+        Assert(this->row_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            C.row_block_size));
+        Assert(this->column_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            B.row_block_size));
+        Assert(B.column_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.column_block_size,
+                                            C.column_block_size));
+      }
+    else if (transpose_A && !transpose_B)
+      {
+        Assert(this->n_rows == B.n_rows,
+               dealii::ExcDimensionMismatch(this->n_rows, B.n_rows));
+        Assert(this->n_columns == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_columns, C.n_rows));
+        Assert(B.n_columns == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_columns, C.n_columns));
+        Assert(this->column_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            C.row_block_size));
+        Assert(this->row_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            B.row_block_size));
+        Assert(B.column_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.column_block_size,
+                                            C.column_block_size));
+      }
+    else if (!transpose_A && transpose_B)
+      {
+        Assert(this->n_columns == B.n_columns,
+               dealii::ExcDimensionMismatch(this->n_columns, B.n_columns));
+        Assert(this->n_rows == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_rows, C.n_rows));
+        Assert(B.n_rows == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_rows, C.n_columns));
+        Assert(this->row_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            C.row_block_size));
+        Assert(this->column_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            B.column_block_size));
+        Assert(B.row_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.row_block_size,
+                                            C.column_block_size));
+      }
+    else // if (transpose_A && transpose_B)
+      {
+        Assert(this->n_rows == B.n_columns,
+               dealii::ExcDimensionMismatch(this->n_rows, B.n_columns));
+        Assert(this->n_columns == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_columns, C.n_rows));
+        Assert(B.n_rows == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_rows, C.n_columns));
+        Assert(this->column_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            C.row_block_size));
+        Assert(this->row_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            B.row_block_size));
+        Assert(B.row_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.column_block_size,
+                                            C.column_block_size));
+      }
+
+    if (this->grid->mpi_process_is_active)
+      {
+        char trans_a = transpose_A ? 'T' : 'N';
+        char trans_b = transpose_B ? 'T' : 'N';
+
+        const NumberType *A_loc =
+          (this->values.size() > 0) ? this->values.data() : nullptr;
+        const NumberType *B_loc =
+          (B.values.size() > 0) ? B.values.data() : nullptr;
+        NumberType *C_loc = (C.values.size() > 0) ? C.values.data() : nullptr;
+        int         m     = C.n_rows;
+        int         n     = C.n_columns;
+        int         k     = transpose_A ? this->n_rows : this->n_columns;
+
+        pgemm(&trans_a,
+              &trans_b,
+              &m,
+              &n,
+              &k,
+              &b,
+              A_loc,
+              &(this->submatrix_row),
+              &(this->submatrix_column),
+              this->descriptor,
+              B_loc,
+              &B.submatrix_row,
+              &B.submatrix_column,
+              B.descriptor,
+              &c,
+              C_loc,
+              &C.submatrix_row,
+              &C.submatrix_column,
+              C.descriptor);
+      }
+    C.state = dftefeScalapack::LAPACKSupport::matrix;
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::zmult(const NumberType                   b,
+                                     const ScaLAPACKMatrix<NumberType> &B,
+                                     const NumberType                   c,
+                                     ScaLAPACKMatrix<NumberType> &      C,
+                                     const bool conjugate_transpose_A,
+                                     const bool conjugate_transpose_B) const
+  {
+    Assert(this->grid == B.grid,
+           dealii::ExcMessage(
+             "The matrices A and B need to have the same process grid"));
+    Assert(C.grid == B.grid,
+           dealii::ExcMessage(
+             "The matrices B and C need to have the same process grid"));
+
+    // see for further info:
+    // https://www.ibm.com/support/knowledgecenter/SSNR5K_4.2.0/com.ibm.cluster.pessl.v4r2.pssl100.doc/am6gr_lgemm.htm
+    if (!conjugate_transpose_A && !conjugate_transpose_B)
+      {
+        Assert(this->n_columns == B.n_rows,
+               dealii::ExcDimensionMismatch(this->n_columns, B.n_rows));
+        Assert(this->n_rows == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_rows, C.n_rows));
+        Assert(B.n_columns == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_columns, C.n_columns));
+        Assert(this->row_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            C.row_block_size));
+        Assert(this->column_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            B.row_block_size));
+        Assert(B.column_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.column_block_size,
+                                            C.column_block_size));
+      }
+    else if (conjugate_transpose_A && !conjugate_transpose_B)
+      {
+        Assert(this->n_rows == B.n_rows,
+               dealii::ExcDimensionMismatch(this->n_rows, B.n_rows));
+        Assert(this->n_columns == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_columns, C.n_rows));
+        Assert(B.n_columns == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_columns, C.n_columns));
+        Assert(this->column_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            C.row_block_size));
+        Assert(this->row_block_size == B.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            B.row_block_size));
+        Assert(B.column_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.column_block_size,
+                                            C.column_block_size));
+      }
+    else if (!conjugate_transpose_A && conjugate_transpose_B)
+      {
+        Assert(this->n_columns == B.n_columns,
+               dealii::ExcDimensionMismatch(this->n_columns, B.n_columns));
+        Assert(this->n_rows == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_rows, C.n_rows));
+        Assert(B.n_rows == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_rows, C.n_columns));
+        Assert(this->row_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            C.row_block_size));
+        Assert(this->column_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            B.column_block_size));
+        Assert(B.row_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.row_block_size,
+                                            C.column_block_size));
+      }
+    else // if (transpose_A && transpose_B)
+      {
+        Assert(this->n_rows == B.n_columns,
+               dealii::ExcDimensionMismatch(this->n_rows, B.n_columns));
+        Assert(this->n_columns == C.n_rows,
+               dealii::ExcDimensionMismatch(this->n_columns, C.n_rows));
+        Assert(B.n_rows == C.n_columns,
+               dealii::ExcDimensionMismatch(B.n_rows, C.n_columns));
+        Assert(this->column_block_size == C.row_block_size,
+               dealii::ExcDimensionMismatch(this->row_block_size,
+                                            C.row_block_size));
+        Assert(this->row_block_size == B.column_block_size,
+               dealii::ExcDimensionMismatch(this->column_block_size,
+                                            B.row_block_size));
+        Assert(B.row_block_size == C.column_block_size,
+               dealii::ExcDimensionMismatch(B.column_block_size,
+                                            C.column_block_size));
+      }
+
+    if (this->grid->mpi_process_is_active)
+      {
+        char trans_a =
+          conjugate_transpose_A ?
+            (std::is_same<NumberType, std::complex<double>>::value ? 'C' :
+                                                                     'T') :
+            'N';
+        char trans_b =
+          conjugate_transpose_B ?
+            (std::is_same<NumberType, std::complex<double>>::value ? 'C' :
+                                                                     'T') :
+            'N';
+
+        const NumberType *A_loc =
+          (this->values.size() > 0) ? this->values.data() : nullptr;
+        const NumberType *B_loc =
+          (B.values.size() > 0) ? B.values.data() : nullptr;
+        NumberType *C_loc = (C.values.size() > 0) ? C.values.data() : nullptr;
+        int         m     = C.n_rows;
+        int         n     = C.n_columns;
+        int         k = conjugate_transpose_A ? this->n_rows : this->n_columns;
+
+        pgemm(&trans_a,
+              &trans_b,
+              &m,
+              &n,
+              &k,
+              &b,
+              A_loc,
+              &(this->submatrix_row),
+              &(this->submatrix_column),
+              this->descriptor,
+              B_loc,
+              &B.submatrix_row,
+              &B.submatrix_column,
+              B.descriptor,
+              &c,
+              C_loc,
+              &C.submatrix_row,
+              &C.submatrix_column,
+              C.descriptor);
+      }
+    C.state = dftefeScalapack::LAPACKSupport::matrix;
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::mmult(ScaLAPACKMatrix<NumberType> &      C,
+                                     const ScaLAPACKMatrix<NumberType> &B,
+                                     const bool adding) const
+  {
+    if (adding)
+      mult(1., B, 1., C, false, false);
+    else
+      mult(1., B, 0, C, false, false);
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::Tmmult(ScaLAPACKMatrix<NumberType> &      C,
+                                      const ScaLAPACKMatrix<NumberType> &B,
+                                      const bool adding) const
+  {
+    if (adding)
+      mult(1., B, 1., C, true, false);
+    else
+      mult(1., B, 0, C, true, false);
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::mTmult(ScaLAPACKMatrix<NumberType> &      C,
+                                      const ScaLAPACKMatrix<NumberType> &B,
+                                      const bool adding) const
+  {
+    if (adding)
+      mult(1., B, 1., C, false, true);
+    else
+      mult(1., B, 0, C, false, true);
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::TmTmult(ScaLAPACKMatrix<NumberType> &      C,
+                                       const ScaLAPACKMatrix<NumberType> &B,
+                                       const bool adding) const
+  {
+    if (adding)
+      mult(1., B, 1., C, true, true);
+    else
+      mult(1., B, 0, C, true, true);
+  }
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::zmmult(ScaLAPACKMatrix<NumberType> &      C,
+                                      const ScaLAPACKMatrix<NumberType> &B,
+                                      const bool adding) const
+  {
+    if (adding)
+      zmult(1., B, 1., C, false, false);
+    else
+      zmult(1., B, 0, C, false, false);
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::zCmmult(ScaLAPACKMatrix<NumberType> &      C,
+                                       const ScaLAPACKMatrix<NumberType> &B,
+                                       const bool adding) const
+  {
+    if (adding)
+      zmult(1., B, 1., C, true, false);
+    else
+      zmult(1., B, 0, C, true, false);
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::zmCmult(ScaLAPACKMatrix<NumberType> &      C,
+                                       const ScaLAPACKMatrix<NumberType> &B,
+                                       const bool adding) const
+  {
+    if (adding)
+      zmult(1., B, 1., C, false, true);
+    else
+      zmult(1., B, 0, C, false, true);
+  }
+
+
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::zCmCmult(ScaLAPACKMatrix<NumberType> &      C,
+                                        const ScaLAPACKMatrix<NumberType> &B,
+                                        const bool adding) const
+  {
+    if (adding)
+      zmult(1., B, 1., C, true, true);
+    else
+      zmult(1., B, 0, C, true, true);
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::compute_cholesky_factorization()
+  {
+    Assert(
+      n_columns == n_rows && property == LAPACKSupport::Property::hermitian,
+      dealii::ExcMessage(
+        "Cholesky factorization can be applied to hermitian matrices only."));
+    Assert(state == LAPACKSupport::matrix,
+           dealii::ExcMessage(
+             "Matrix has to be in Matrix state before calling this function."));
+
+    if (grid->mpi_process_is_active)
+      {
+        int         info  = 0;
+        NumberType *A_loc = this->values.data();
+        // pdpotrf_(&uplo,&n_columns,A_loc,&submatrix_row,&submatrix_column,descriptor,&info);
+        ppotrf(&uplo,
+               &n_columns,
+               A_loc,
+               &submatrix_row,
+               &submatrix_column,
+               descriptor,
+               &info);
+        AssertThrow(info == 0,
+                    dftefeScalapack::LAPACKSupport::ExcErrorCode("ppotrf",
+                                                                 info));
+      }
+    state    = dftefeScalapack::LAPACKSupport::cholesky;
+    property = (uplo == 'L' ? dftefeScalapack::LAPACKSupport::lower_triangular :
+                              dftefeScalapack::LAPACKSupport::upper_triangular);
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::compute_lu_factorization()
+  {
+    Assert(state == LAPACKSupport::matrix,
+           dealii::ExcMessage(
+             "Matrix has to be in Matrix state before calling this function."));
+
+    if (grid->mpi_process_is_active)
+      {
+        int         info  = 0;
+        NumberType *A_loc = this->values.data();
+
+        const int iarow = indxg2p_(&submatrix_row,
+                                   &row_block_size,
+                                   &(grid->this_process_row),
+                                   &first_process_row,
+                                   &(grid->n_process_rows));
+        const int mp    = numroc_(&n_rows,
+                               &row_block_size,
+                               &(grid->this_process_row),
+                               &iarow,
+                               &(grid->n_process_rows));
+        ipiv.resize(mp + row_block_size);
+
+        pgetrf(&n_rows,
+               &n_columns,
+               A_loc,
+               &submatrix_row,
+               &submatrix_column,
+               descriptor,
+               ipiv.data(),
+               &info);
+        AssertThrow(info == 0,
+                    dftefeScalapack::LAPACKSupport::ExcErrorCode("pgetrf",
+                                                                 info));
+      }
+    state    = dftefeScalapack::LAPACKSupport::State::lu;
+    property = dftefeScalapack::LAPACKSupport::Property::general;
+  }
+
+  template <typename NumberType>
+  void
+  ScaLAPACKMatrix<NumberType>::invert()
+  {
+    // Check whether matrix is hermitian and save flag.
+    // If a Cholesky factorization has been applied previously,
+    // the original matrix was hermitian.
+    const bool is_hermitian =
+      (property == dftefeScalapack::LAPACKSupport::hermitian ||
+       state == dftefeScalapack::LAPACKSupport::State::cholesky);
+
+    // Check whether matrix is triangular and is in an unfactorized state.
+    const bool is_triangular =
+      (property == dftefeScalapack::LAPACKSupport::upper_triangular ||
+       property == dftefeScalapack::LAPACKSupport::lower_triangular) &&
+      (state == dftefeScalapack::LAPACKSupport::State::matrix ||
+       state == dftefeScalapack::LAPACKSupport::State::inverse_matrix);
+
+    if (is_triangular)
+      {
+        if (grid->mpi_process_is_active)
+          {
+            const char uploTriangular =
+              property == dftefeScalapack::LAPACKSupport::upper_triangular ?
+                'U' :
+                'L';
+            const char  diag  = 'N';
+            int         info  = 0;
+            NumberType *A_loc = this->values.data();
+            ptrtri(&uploTriangular,
+                   &diag,
+                   &n_columns,
+                   A_loc,
+                   &submatrix_row,
+                   &submatrix_column,
+                   descriptor,
+                   &info);
+            AssertThrow(info == 0,
+                        dftefeScalapack::LAPACKSupport::ExcErrorCode("ptrtri",
+                                                                     info));
+            // The inversion is stored in the same part as the triangular
+            // matrix, so we don't need to re-set the property here.
+          }
+      }
+    else
+      {
+        // Matrix is neither in Cholesky nor LU state.
+        // Compute the required factorizations based on the property of the
+        // matrix.
+        if (!(state == dftefeScalapack::LAPACKSupport::State::lu ||
+              state == dftefeScalapack::LAPACKSupport::State::cholesky))
+          {
+            if (is_hermitian)
+              compute_cholesky_factorization();
+            else
+              compute_lu_factorization();
+          }
+        if (grid->mpi_process_is_active)
+          {
+            int         info  = 0;
+            NumberType *A_loc = this->values.data();
+
+            if (is_hermitian)
+              {
+                ppotri(&uplo,
+                       &n_columns,
+                       A_loc,
+                       &submatrix_row,
+                       &submatrix_column,
+                       descriptor,
+                       &info);
+                AssertThrow(
+                  info == 0,
+                  dftefeScalapack::LAPACKSupport::ExcErrorCode("ppotri", info));
+                property = dftefeScalapack::LAPACKSupport::Property::hermitian;
+              }
+            else
+              {
+                int lwork = -1, liwork = -1;
+                work.resize(1);
+                iwork.resize(1);
+
+                pgetri(&n_columns,
+                       A_loc,
+                       &submatrix_row,
+                       &submatrix_column,
+                       descriptor,
+                       ipiv.data(),
+                       work.data(),
+                       &lwork,
+                       iwork.data(),
+                       &liwork,
+                       &info);
+
+                AssertThrow(
+                  info == 0,
+                  dftefeScalapack::LAPACKSupport::ExcErrorCode("pgetri", info));
+                lwork  = lworkFromWork(work);
+                liwork = iwork[0];
+                work.resize(lwork);
+                iwork.resize(liwork);
+
+                pgetri(&n_columns,
+                       A_loc,
+                       &submatrix_row,
+                       &submatrix_column,
+                       descriptor,
+                       ipiv.data(),
+                       work.data(),
+                       &lwork,
+                       iwork.data(),
+                       &liwork,
+                       &info);
+
+                AssertThrow(
+                  info == 0,
+                  dftefeScalapack::LAPACKSupport::ExcErrorCode("pgetri", info));
+              }
+          }
+      }
+    state = dftefeScalapack::LAPACKSupport::State::inverse_matrix;
+  }
+
+  template <typename NumberType>
+  std::vector<double>
+  ScaLAPACKMatrix<NumberType>::eigenpairs_hermitian_by_index(
+    const std::pair<unsigned int, unsigned int> &index_limits,
+    const bool                                   compute_eigenvectors)
+  {
+    // check validity of index limits
+    AssertIndexRange(index_limits.first, n_rows);
+    AssertIndexRange(index_limits.second, n_rows);
+
+    std::pair<unsigned int, unsigned int> idx =
+      std::make_pair(std::min(index_limits.first, index_limits.second),
+                     std::max(index_limits.first, index_limits.second));
+
+    // compute all eigenvalues/eigenvectors
+    if (idx.first == 0 && idx.second == static_cast<unsigned int>(n_rows - 1))
+      return eigenpairs_hermitian(compute_eigenvectors);
+    else
+      return eigenpairs_hermitian(compute_eigenvectors, idx);
+  }
+
+
+  template <typename NumberType>
+  std::vector<double>
+  ScaLAPACKMatrix<NumberType>::eigenpairs_hermitian(
+    const bool                                   compute_eigenvectors,
+    const std::pair<unsigned int, unsigned int> &eigenvalue_idx,
+    const std::pair<double, double> &            eigenvalue_limits)
+  {
+    Assert(state == dftefeScalapack::LAPACKSupport::matrix,
+           dealii::ExcMessage(
+             "Matrix has to be in Matrix state before calling this function."));
+    Assert(property == dftefeScalapack::LAPACKSupport::hermitian,
+           dealii::ExcMessage(
+             "Matrix has to be hermitian for this operation."));
+
+    std::lock_guard<std::mutex> lock(mutex);
+
+    const bool use_values = (std::isnan(eigenvalue_limits.first) ||
+                             std::isnan(eigenvalue_limits.second)) ?
+                              false :
+                              true;
+    const bool use_indices =
+      ((eigenvalue_idx.first == dealii::numbers::invalid_unsigned_int) ||
+       (eigenvalue_idx.second == dealii::numbers::invalid_unsigned_int)) ?
+        false :
+        true;
+
+    Assert(
+      !(use_values && use_indices),
+      dealii::ExcMessage(
+        "Prescribing both the index and value range for the eigenvalues is ambiguous"));
+
+    // if computation of eigenvectors is not required use a sufficiently small
+    // distributed matrix
+    std::unique_ptr<ScaLAPACKMatrix<NumberType>> eigenvectors =
+      compute_eigenvectors ?
+        std::make_unique<ScaLAPACKMatrix<NumberType>>(n_rows,
+                                                      grid,
+                                                      row_block_size) :
+        std::make_unique<ScaLAPACKMatrix<NumberType>>(
+          grid->n_process_rows, grid->n_process_columns, grid, 1, 1);
+
+    eigenvectors->property = property;
+    // number of eigenvalues to be returned from psyevx; upon successful exit ev
+    // contains the m seclected eigenvalues in ascending order set to all
+    // eigenvaleus in case we will be using psyev.
+    int                 m = n_rows;
+    std::vector<double> ev(n_rows);
+
+    if (grid->mpi_process_is_active)
+      {
+        int info = 0;
+        /*
+         * for jobz==N only eigenvalues are computed, for jobz='V' also the
+         * eigenvectors of the matrix are computed
+         */
+        char jobz  = compute_eigenvectors ? 'V' : 'N';
+        char range = 'A';
+        // default value is to compute all eigenvalues and optionally
+        // eigenvectors
+        bool   all_eigenpairs = true;
+        double vl             = 0.0;
+        double vu             = 0.0;
+        int    il = 1, iu = 1;
+        // number of eigenvectors to be returned;
+        // upon successful exit the first m=nz columns contain the selected
+        // eigenvectors (only if jobz=='V')
+        int    nz     = 0;
+        double abstol = 0.0;
+
+        // orfac decides which eigenvectors should be reorthogonalized
+        // see
+        // http://www.netlib.org/scalapack/explore-html/df/d1a/pdsyevx_8f_source.html
+        // for explanation to keeps simple no reorthogonalized will be done by
+        // setting orfac to 0
+        double orfac = 0;
+        // contains the indices of eigenvectors that failed to converge
+        std::vector<int> ifail;
+        // This array contains indices of eigenvectors corresponding to
+        // a cluster of eigenvalues that could not be reorthogonalized
+        // due to insufficient workspace
+        // see
+        // http://www.netlib.org/scalapack/explore-html/df/d1a/pdsyevx_8f_source.html
+        // for explanation
+        std::vector<int> iclustr;
+        // This array contains the gap between eigenvalues whose
+        // eigenvectors could not be reorthogonalized.
+        // see
+        // http://www.netlib.org/scalapack/explore-html/df/d1a/pdsyevx_8f_source.html
+        // for explanation
+        std::vector<double> gap(n_local_rows * n_local_columns);
+
+        // index range for eigenvalues is not specified
+        if (!use_indices)
+          {
+            // interval for eigenvalues is not specified and consequently all
+            // eigenvalues/eigenpairs will be computed
+            if (!use_values)
+              {
+                range          = 'A';
+                all_eigenpairs = true;
+              }
+            else
+              {
+                range          = 'V';
+                all_eigenpairs = false;
+                vl =
+                  std::min(eigenvalue_limits.first, eigenvalue_limits.second);
+                vu =
+                  std::max(eigenvalue_limits.first, eigenvalue_limits.second);
+              }
+          }
+        else
+          {
+            range          = 'I';
+            all_eigenpairs = false;
+            // as Fortran starts counting/indexing from 1 unlike C/C++, where it
+            // starts from 0
+            il = std::min(eigenvalue_idx.first, eigenvalue_idx.second) + 1;
+            iu = std::max(eigenvalue_idx.first, eigenvalue_idx.second) + 1;
+          }
+        NumberType *A_loc = this->values.data();
+        /*
+         * by setting lwork to -1 a workspace query for optimal length of work
+         * is performed
+         */
+        int         lwork  = -1;
+        int         liwork = -1;
+        NumberType *eigenvectors_loc =
+          (compute_eigenvectors ? eigenvectors->values.data() : nullptr);
+        work.resize(1);
+        iwork.resize(1);
+
+        if (all_eigenpairs)
+          {
+            psyev(&jobz,
+                  &uplo,
+                  &n_rows,
+                  A_loc,
+                  &submatrix_row,
+                  &submatrix_column,
+                  descriptor,
+                  ev.data(),
+                  eigenvectors_loc,
+                  &eigenvectors->submatrix_row,
+                  &eigenvectors->submatrix_column,
+                  eigenvectors->descriptor,
+                  work.data(),
+                  &lwork,
+                  &info);
+            AssertThrow(info == 0,
+                        dftefeScalapack::LAPACKSupport::ExcErrorCode("psyev",
+                                                                     info));
+          }
+        else
+          {
+            char cmach = compute_eigenvectors ? 'U' : 'S';
+            plamch(&(this->grid->blacs_context), &cmach, abstol);
+            abstol *= 2;
+            ifail.resize(n_rows);
+            iclustr.resize(2 * grid->n_process_rows * grid->n_process_columns);
+            gap.resize(grid->n_process_rows * grid->n_process_columns);
+
+            psyevx(&jobz,
+                   &range,
+                   &uplo,
+                   &n_rows,
+                   A_loc,
+                   &submatrix_row,
+                   &submatrix_column,
+                   descriptor,
+                   &vl,
+                   &vu,
+                   &il,
+                   &iu,
+                   &abstol,
+                   &m,
+                   &nz,
+                   ev.data(),
+                   &orfac,
+                   eigenvectors_loc,
+                   &eigenvectors->submatrix_row,
+                   &eigenvectors->submatrix_column,
+                   eigenvectors->descriptor,
+                   work.data(),
+                   &lwork,
+                   iwork.data(),
+                   &liwork,
+                   ifail.data(),
+                   iclustr.data(),
+                   gap.data(),
+                   &info);
+            AssertThrow(info == 0,
+                        dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevx",
+                                                                     info));
+          }
+        lwork = lworkFromWork(work);
+        work.resize(lwork);
+
+        if (all_eigenpairs)
+          {
+            psyev(&jobz,
+                  &uplo,
+                  &n_rows,
+                  A_loc,
+                  &submatrix_row,
+                  &submatrix_column,
+                  descriptor,
+                  ev.data(),
+                  eigenvectors_loc,
+                  &eigenvectors->submatrix_row,
+                  &eigenvectors->submatrix_column,
+                  eigenvectors->descriptor,
+                  work.data(),
+                  &lwork,
+                  &info);
+
+            AssertThrow(info == 0,
+                        dftefeScalapack::LAPACKSupport::ExcErrorCode("psyev",
+                                                                     info));
+          }
+        else
+          {
+            liwork = iwork[0];
+            AssertThrow(liwork > 0, dealii::ExcInternalError());
+            iwork.resize(liwork);
+
+            psyevx(&jobz,
+                   &range,
+                   &uplo,
+                   &n_rows,
+                   A_loc,
+                   &submatrix_row,
+                   &submatrix_column,
+                   descriptor,
+                   &vl,
+                   &vu,
+                   &il,
+                   &iu,
+                   &abstol,
+                   &m,
+                   &nz,
+                   ev.data(),
+                   &orfac,
+                   eigenvectors_loc,
+                   &eigenvectors->submatrix_row,
+                   &eigenvectors->submatrix_column,
+                   eigenvectors->descriptor,
+                   work.data(),
+                   &lwork,
+                   iwork.data(),
+                   &liwork,
+                   ifail.data(),
+                   iclustr.data(),
+                   gap.data(),
+                   &info);
+
+            AssertThrow(info == 0,
+                        dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevx",
+                                                                     info));
+          }
+        // if eigenvectors are queried copy eigenvectors to original matrix
+        // as the temporary matrix eigenvectors has identical dimensions and
+        // block-cyclic distribution we simply swap the local array
+        if (compute_eigenvectors)
+          this->values.swap(eigenvectors->values);
+
+        // adapt the size of ev to fit m upon return
+        while (ev.size() > static_cast<size_type>(m))
+          ev.pop_back();
+      }
+    /*
+     * send number of computed eigenvalues to inactive processes
+     */
+    grid->send_to_inactive(&m, 1);
+
+    /*
+     * inactive processes have to resize array of eigenvalues
+     */
+    if (!grid->mpi_process_is_active)
+      ev.resize(m);
+    /*
+     * send the eigenvalues to processors not being part of the process grid
+     */
+    grid->send_to_inactive(ev.data(), ev.size());
+
+    /*
+     * if only eigenvalues are queried the content of the matrix will be
+     * destroyed if the eigenpairs are queried matrix A on exit stores the
+     * eigenvectors in the columns
+     */
+    if (compute_eigenvectors)
+      {
+        property = dftefeScalapack::LAPACKSupport::Property::general;
+        state    = dftefeScalapack::LAPACKSupport::eigenvalues;
+      }
+    else
+      state = dftefeScalapack::LAPACKSupport::unusable;
+
+    return ev;
+  }
+
+
+  template <typename NumberType>
+  std::vector<double>
+  ScaLAPACKMatrix<NumberType>::eigenpairs_hermitian_by_index_MRRR(
+    const std::pair<unsigned int, unsigned int> &index_limits,
+    const bool                                   compute_eigenvectors)
+  {
+    // Check validity of index limits.
+    AssertIndexRange(index_limits.first, static_cast<unsigned int>(n_rows));
+    AssertIndexRange(index_limits.second, static_cast<unsigned int>(n_rows));
+
+    const std::pair<unsigned int, unsigned int> idx =
+      std::make_pair(std::min(index_limits.first, index_limits.second),
+                     std::max(index_limits.first, index_limits.second));
+
+    // Compute all eigenvalues/eigenvectors.
+    if (idx.first == 0 && idx.second == static_cast<unsigned int>(n_rows - 1))
+      return eigenpairs_hermitian_MRRR(compute_eigenvectors);
+    else
+      return eigenpairs_hermitian_MRRR(compute_eigenvectors, idx);
+  }
+
+
+  template <typename NumberType>
+  std::vector<double>
+  ScaLAPACKMatrix<NumberType>::eigenpairs_hermitian_MRRR(
+    const bool                                   compute_eigenvectors,
+    const std::pair<unsigned int, unsigned int> &eigenvalue_idx,
+    const std::pair<double, double> &            eigenvalue_limits)
+  {
+    Assert(state == dftefeScalapack::LAPACKSupport::matrix,
+           dealii::ExcMessage(
+             "Matrix has to be in Matrix state before calling this function."));
+    Assert(property == dftefeScalapack::LAPACKSupport::hermitian,
+           dealii::ExcMessage(
+             "Matrix has to be hermitian for this operation."));
+
+    std::lock_guard<std::mutex> lock(mutex);
+
+    const bool use_values = (std::isnan(eigenvalue_limits.first) ||
+                             std::isnan(eigenvalue_limits.second)) ?
+                              false :
+                              true;
+    const bool use_indices =
+      ((eigenvalue_idx.first == dealii::numbers::invalid_unsigned_int) ||
+       (eigenvalue_idx.second == dealii::numbers::invalid_unsigned_int)) ?
+        false :
+        true;
+
+    Assert(
+      !(use_values && use_indices),
+      dealii::ExcMessage(
+        "Prescribing both the index and value range for the eigenvalues is ambiguous"));
+
+    // If computation of eigenvectors is not required, use a sufficiently small
+    // distributed matrix.
+    std::unique_ptr<ScaLAPACKMatrix<NumberType>> eigenvectors =
+      compute_eigenvectors ?
+        std::make_unique<ScaLAPACKMatrix<NumberType>>(n_rows,
+                                                      grid,
+                                                      row_block_size) :
+        std::make_unique<ScaLAPACKMatrix<NumberType>>(
+          grid->n_process_rows, grid->n_process_columns, grid, 1, 1);
+
+    eigenvectors->property = property;
+    // Number of eigenvalues to be returned from psyevr; upon successful exit ev
+    // contains the m seclected eigenvalues in ascending order.
+    int                 m = n_rows;
+    std::vector<double> ev(n_rows);
+
+    // Number of eigenvectors to be returned;
+    // Upon successful exit the first m=nz columns contain the selected
+    // eigenvectors (only if jobz=='V').
+    int nz = 0;
+
+    if (grid->mpi_process_is_active)
+      {
+        int info = 0;
+        /*
+         * For jobz==N only eigenvalues are computed, for jobz='V' also the
+         * eigenvectors of the matrix are computed.
+         */
+        char jobz = compute_eigenvectors ? 'V' : 'N';
+        // Default value is to compute all eigenvalues and optionally
+        // eigenvectors.
+        char   range = 'A';
+        double vl    = 0.0;
+        double vu    = 0.0;
+        int    il = 1, iu = 1;
+
+        // Index range for eigenvalues is not specified.
+        if (!use_indices)
+          {
+            // Interval for eigenvalues is not specified and consequently all
+            // eigenvalues/eigenpairs will be computed.
+            if (!use_values)
+              {
+                range = 'A';
+              }
+            else
+              {
+                range = 'V';
+                vl =
+                  std::min(eigenvalue_limits.first, eigenvalue_limits.second);
+                vu =
+                  std::max(eigenvalue_limits.first, eigenvalue_limits.second);
+              }
+          }
+        else
+          {
+            range = 'I';
+            // As Fortran starts counting/indexing from 1 unlike C/C++, where it
+            // starts from 0.
+            il = std::min(eigenvalue_idx.first, eigenvalue_idx.second) + 1;
+            iu = std::max(eigenvalue_idx.first, eigenvalue_idx.second) + 1;
+          }
+        NumberType *A_loc = this->values.data();
+
+        /*
+         * By setting lwork to -1 a workspace query for optimal length of work
+         * is performed.
+         */
+        int         lwork  = -1;
+        int         liwork = -1;
+        NumberType *eigenvectors_loc =
+          (compute_eigenvectors ? eigenvectors->values.data() : nullptr);
+        work.resize(1);
+        iwork.resize(1);
+
+        psyevr(&jobz,
+               &range,
+               &uplo,
+               &n_rows,
+               A_loc,
+               &submatrix_row,
+               &submatrix_column,
+               descriptor,
+               &vl,
+               &vu,
+               &il,
+               &iu,
+               &m,
+               &nz,
+               ev.data(),
+               eigenvectors_loc,
+               &eigenvectors->submatrix_row,
+               &eigenvectors->submatrix_column,
+               eigenvectors->descriptor,
+               work.data(),
+               &lwork,
+               iwork.data(),
+               &liwork,
+               &info);
+        AssertThrow(info == 0,
+                    dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevr",
+                                                                 info));
+
+        lwork = lworkFromWork(work);
+        work.resize(lwork);
+        liwork = iwork[0];
+        iwork.resize(liwork);
+
+        psyevr(&jobz,
+               &range,
+               &uplo,
+               &n_rows,
+               A_loc,
+               &submatrix_row,
+               &submatrix_column,
+               descriptor,
+               &vl,
+               &vu,
+               &il,
+               &iu,
+               &m,
+               &nz,
+               ev.data(),
+               eigenvectors_loc,
+               &eigenvectors->submatrix_row,
+               &eigenvectors->submatrix_column,
+               eigenvectors->descriptor,
+               work.data(),
+               &lwork,
+               iwork.data(),
+               &liwork,
+               &info);
+
+        AssertThrow(info == 0,
+                    dftefeScalapack::LAPACKSupport::ExcErrorCode("psyevr",
+                                                                 info));
+
+        if (compute_eigenvectors)
+          AssertThrow(
+            m == nz,
+            dealii::ExcMessage(
+              "psyevr failed to compute all eigenvectors for the selected eigenvalues"));
+
+        // If eigenvectors are queried, copy eigenvectors to original matrix.
+        // As the temporary matrix eigenvectors has identical dimensions and
+        // block-cyclic distribution we simply swap the local array.
+        if (compute_eigenvectors)
+          this->values.swap(eigenvectors->values);
+
+        // Adapt the size of ev to fit m upon return.
+        while (ev.size() > static_cast<size_type>(m))
+          ev.pop_back();
+      }
+    /*
+     * Send number of computed eigenvalues to inactive processes.
+     */
+    grid->send_to_inactive(&m, 1);
+
+    /*
+     * Inactive processes have to resize array of eigenvalues.
+     */
+    if (!grid->mpi_process_is_active)
+      ev.resize(m);
+    /*
+     * Send the eigenvalues to processors not being part of the process grid.
+     */
+    grid->send_to_inactive(ev.data(), ev.size());
+
+    /*
+     * If only eigenvalues are queried, the content of the matrix will be
+     * destroyed. If the eigenpairs are queried, matrix A on exit stores the
+     * eigenvectors in the columns.
+     */
+    if (compute_eigenvectors)
+      {
+        property = dftefeScalapack::LAPACKSupport::Property::general;
+        state    = dftefeScalapack::LAPACKSupport::eigenvalues;
+      }
+    else
+      state = dftefeScalapack::LAPACKSupport::unusable;
+
+    return ev;
+  }
+
+
+  template class ScaLAPACKMatrix<double>;
+  template class ScaLAPACKMatrix<std::complex<double>>;
+} // namespace dftefeScalapack
diff --git a/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.h b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.h
new file mode 100644
index 000000000..4f97d0e7d
--- /dev/null
+++ b/src/linearAlgebra/DO_NOT_USE_ScalapckWrapper/scalapackWrapper.h
@@ -0,0 +1,802 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2019-2020  The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+
+
+#ifndef ScaLAPACKMatrix_H_
+#define ScaLAPACKMatrix_H_
+
+#include "process_grid.h"
+#include "lapack_support.h"
+#include <memory>
+#include <vector>
+#include <deal.II/base/thread_management.h>
+
+
+namespace dftefeScalapack
+{
+  /**
+   * @brief Scalapack wrapper adapted from dealii library and extended implementation to complex datatype
+   *
+   * @author Sambit Das
+   */
+  template <typename NumberType>
+  class ScaLAPACKMatrix
+  {
+  public:
+    /**
+     * Declare the type for container size.
+     */
+    using size_type = unsigned int;
+
+    /**
+     * Constructor for a rectangular matrix with @p n_rows and @p n_cols
+     * and distributed using the grid @p process_grid.
+     *
+     * The parameters @p row_block_size and @p column_block_size are the block sizes used
+     * for the block-cyclic distribution of the matrix.
+     * In general, it is recommended to use powers of $2$, e.g. $16,32,64,
+     * \dots$.
+     */
+    ScaLAPACKMatrix(
+      const size_type                                            n_rows,
+      const size_type                                            n_columns,
+      const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+      const size_type                                row_block_size    = 32,
+      const size_type                                column_block_size = 32,
+      const dftefeScalapack::LAPACKSupport::Property property =
+        dftefeScalapack::LAPACKSupport::Property::general);
+
+    /**
+     * Constructor for a square matrix of size @p size, and distributed
+     * using the process grid in @p process_grid.
+     *
+     * The parameter @p block_size is used for the block-cyclic distribution of the matrix.
+     * An identical block size is used for the rows and columns of the matrix.
+     * In general, it is recommended to use powers of $2$, e.g. $16,32,64,
+     * \dots$.
+     */
+    ScaLAPACKMatrix(
+      const size_type                                            size,
+      const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+      const size_type                                block_size = 32,
+      const dftefeScalapack::LAPACKSupport::Property property =
+        dftefeScalapack::LAPACKSupport::Property::hermitian);
+
+
+    /**
+     * Initialize the rectangular matrix with @p n_rows and @p n_cols
+     * and distributed using the grid @p process_grid.
+     *
+     * The parameters @p row_block_size and @p column_block_size are the block sizes used
+     * for the block-cyclic distribution of the matrix.
+     * In general, it is recommended to use powers of $2$, e.g. $16,32,64,
+     * \dots$.
+     */
+    void
+    reinit(
+      const size_type                                            n_rows,
+      const size_type                                            n_columns,
+      const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+      const size_type                                row_block_size    = 32,
+      const size_type                                column_block_size = 32,
+      const dftefeScalapack::LAPACKSupport::Property property =
+        dftefeScalapack::LAPACKSupport::Property::general);
+
+    /**
+     * Initialize the square matrix of size @p size and distributed using the grid @p process_grid.
+     *
+     * The parameter @p block_size is used for the block-cyclic distribution of the matrix.
+     * An identical block size is used for the rows and columns of the matrix.
+     * In general, it is recommended to use powers of $2$, e.g. $16,32,64,
+     * \dots$.
+     */
+    void
+    reinit(
+      const size_type                                            size,
+      const std::shared_ptr<const dftefeScalapack::ProcessGrid> &process_grid,
+      const size_type                                block_size = 32,
+      const dftefeScalapack::LAPACKSupport::Property property =
+        dftefeScalapack::LAPACKSupport::Property::hermitian);
+
+
+    /**
+     * Assign @p property to this matrix.
+     */
+    void
+    set_property(const dftefeScalapack::LAPACKSupport::Property property);
+
+    /**
+     * Return current @p property of this matrix
+     */
+    dftefeScalapack::LAPACKSupport::Property
+    get_property() const;
+
+    /**
+     * Return current @p state of this matrix
+     */
+    dftefeScalapack::LAPACKSupport::State
+    get_state() const;
+
+
+    /**
+     * Copy the contents of the distributed matrix into a differently distributed matrix @p dest.
+     * The function also works for matrices with different process grids
+     * or block-cyclic distributions.
+     */
+    void
+    copy_to(ScaLAPACKMatrix<NumberType> &dest) const;
+
+
+    /**
+     * Complex conjugate.
+     */
+    void
+    conjugate();
+
+
+    /**
+     * The operations based on the input parameter @p transpose_B and the
+     * alignment conditions are summarized in the following table:
+     *
+     * | transpose_B |          Block Sizes         |                    Operation                  |
+     * | :---------: | :--------------------------: | :-------------------------------------------: |
+     * |   false     | $MB_A=MB_B$ <br> $NB_A=NB_B$ |  $\mathbf{A} = a \mathbf{A} + b \mathbf{B}$   |
+     * |   true      | $MB_A=NB_B$ <br> $NB_A=MB_B$ | $\mathbf{A} = a \mathbf{A} + b \mathbf{B}^T$  |
+     *
+     * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process
+     * grid.
+     */
+    void
+    add(const ScaLAPACKMatrix<NumberType> &B,
+        const NumberType                   a           = 0.,
+        const NumberType                   b           = 1.,
+        const bool                         transpose_B = false);
+
+
+    /**
+     * The operations based on the input parameter @p conjugate_transpose_B and the
+     * alignment conditions are summarized in the following table:
+     *
+     * | transpose_B |          Block Sizes         |                    Operation                  |
+     * | :---------: | :--------------------------: | :-------------------------------------------: |
+     * |   false     | $MB_A=MB_B$ <br> $NB_A=NB_B$ |  $\mathbf{A} = a \mathbf{A} + b \mathbf{B}$   |
+     * |   true      | $MB_A=NB_B$ <br> $NB_A=MB_B$ | $\mathbf{A} = a \mathbf{A} + b \mathbf{B}^C$  |
+     *
+     * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process
+     * grid.
+     */
+    void
+    zadd(const ScaLAPACKMatrix<NumberType> &B,
+         const NumberType                   a                     = 0.,
+         const NumberType                   b                     = 1.,
+         const bool                         conjugate_transpose_B = false);
+
+    /**
+     * Transposing assignment: $\mathbf{A} = \mathbf{B}^T$
+     *
+     * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process
+     * grid.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=NB_B$ and
+     * $NB_A=MB_B$.
+     */
+    void
+    copy_transposed(const ScaLAPACKMatrix<NumberType> &B);
+
+
+    /**
+     * Transposing assignment: $\mathbf{A} = \mathbf{B}^C$
+     *
+     * The matrices $\mathbf{A}$ and $\mathbf{B}$ must have the same process
+     * grid.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=NB_B$ and
+     * $NB_A=MB_B$.
+     */
+    void
+    copy_conjugate_transposed(const ScaLAPACKMatrix<NumberType> &B);
+
+    /**
+     * Matrix-matrix-multiplication:
+     *
+     * The operations based on the input parameters and the alignment conditions
+     * are summarized in the following table:
+     *
+     * | transpose_A | transpose_B |                  Block Sizes                  |                             Operation                           |
+     * | :---------: | :---------: | :-------------------------------------------: | :-------------------------------------------------------------: |
+     * | false       |   false     | $MB_A=MB_C$ <br> $NB_A=MB_B$ <br> $NB_B=NB_C$ |   $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B} + c \mathbf{C}$   |
+     * | false       |   true      | $MB_A=MB_C$ <br> $NB_A=NB_B$ <br> $MB_B=NB_C$ |  $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B}^T + c \mathbf{C}$  |
+     * | true        |   false     | $MB_A=MB_B$ <br> $NB_A=MB_C$ <br> $NB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^T \cdot \mathbf{B} + c \mathbf{C}$   |
+     * | true        |   true      | $MB_A=NB_B$ <br> $NB_A=MB_C$ <br> $MB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^T \cdot \mathbf{B}^T + c \mathbf{C}$ |
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The matrices $\mathbf{A}$, $\mathbf{B}$ and $\mathbf{C}$ must have the
+     * same process grid.
+     */
+    void
+    mult(const NumberType                   b,
+         const ScaLAPACKMatrix<NumberType> &B,
+         const NumberType                   c,
+         ScaLAPACKMatrix<NumberType> &      C,
+         const bool                         transpose_A = false,
+         const bool                         transpose_B = false) const;
+
+
+    /**
+     * Matrix-matrix-multiplication:
+     *
+     * The operations based on the input parameters and the alignment conditions
+     * are summarized in the following table:
+     *
+     * | conjugate_transpose_A | conjugate_transpose_B |                  Block Sizes                  |                             Operation                           |
+     * | :---------: | :---------: | :-------------------------------------------: | :-------------------------------------------------------------: |
+     * | false       |   false     | $MB_A=MB_C$ <br> $NB_A=MB_B$ <br> $NB_B=NB_C$ |   $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B} + c \mathbf{C}$   |
+     * | false       |   true      | $MB_A=MB_C$ <br> $NB_A=NB_B$ <br> $MB_B=NB_C$ |  $\mathbf{C} = b \mathbf{A} \cdot \mathbf{B}^C + c \mathbf{C}$  |
+     * | true        |   false     | $MB_A=MB_B$ <br> $NB_A=MB_C$ <br> $NB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^C \cdot \mathbf{B} + c \mathbf{C}$   |
+     * | true        |   true      | $MB_A=NB_B$ <br> $NB_A=MB_C$ <br> $MB_B=NB_C$ | $\mathbf{C} = b \mathbf{A}^C \cdot \mathbf{B}^C + c \mathbf{C}$ |
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The matrices $\mathbf{A}$, $\mathbf{B}$ and $\mathbf{C}$ must have the
+     * same process grid.
+     */
+    void
+    zmult(const NumberType                   b,
+          const ScaLAPACKMatrix<NumberType> &B,
+          const NumberType                   c,
+          ScaLAPACKMatrix<NumberType> &      C,
+          const bool                         conjugate_transpose_A = false,
+          const bool conjugate_transpose_B = false) const;
+
+
+    /**
+     * Matrix-matrix-multiplication.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}$
+     *
+     * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=MB_C$,
+     * $NB_A=MB_B$ and $NB_B=NB_C$.
+     */
+    void
+    mmult(ScaLAPACKMatrix<NumberType> &      C,
+          const ScaLAPACKMatrix<NumberType> &B,
+          const bool                         adding = false) const;
+
+    /**
+     * Matrix-matrix-multiplication using transpose of $\mathbf{A}$.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^T \cdot \mathbf{B}$
+     *
+     * else $\mathbf{C} = \mathbf{A}^T \cdot \mathbf{B}$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=MB_B$,
+     * $NB_A=MB_C$ and $NB_B=NB_C$.
+     */
+    void
+    Tmmult(ScaLAPACKMatrix<NumberType> &      C,
+           const ScaLAPACKMatrix<NumberType> &B,
+           const bool                         adding = false) const;
+
+    /**
+     * Matrix-matrix-multiplication using the transpose of $\mathbf{B}$.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}^T$
+     *
+     * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}^T$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=MB_C$,
+     * $NB_A=NB_B$ and $MB_B=NB_C$.
+     */
+    void
+    mTmult(ScaLAPACKMatrix<NumberType> &      C,
+           const ScaLAPACKMatrix<NumberType> &B,
+           const bool                         adding = false) const;
+
+    /**
+     * Matrix-matrix-multiplication using transpose of $\mathbf{A}$ and
+     * $\mathbf{B}$.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^T \cdot
+     * \mathbf{B}^T$
+     *
+     * else $\mathbf{C} = \mathbf{A}^T \cdot \mathbf{B}^T$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=NB_B$,
+     * $NB_A=MB_C$ and $MB_B=NB_C$.
+     */
+    void
+    TmTmult(ScaLAPACKMatrix<NumberType> &      C,
+            const ScaLAPACKMatrix<NumberType> &B,
+            const bool                         adding = false) const;
+
+
+    /**
+     * Matrix-matrix-multiplication.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}$
+     *
+     * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=MB_C$,
+     * $NB_A=MB_B$ and $NB_B=NB_C$.
+     */
+    void
+    zmmult(ScaLAPACKMatrix<NumberType> &      C,
+           const ScaLAPACKMatrix<NumberType> &B,
+           const bool                         adding = false) const;
+
+    /**
+     * Matrix-matrix-multiplication using conjugate transpose of $\mathbf{A}$.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^C \cdot \mathbf{B}$
+     *
+     * else $\mathbf{C} = \mathbf{A}^C \cdot \mathbf{B}$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=MB_B$,
+     * $NB_A=MB_C$ and $NB_B=NB_C$.
+     */
+    void
+    zCmmult(ScaLAPACKMatrix<NumberType> &      C,
+            const ScaLAPACKMatrix<NumberType> &B,
+            const bool                         adding = false) const;
+
+    /**
+     * Matrix-matrix-multiplication using the conjugate transpose of
+     * $\mathbf{B}$.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A} \cdot \mathbf{B}^C$
+     *
+     * else $\mathbf{C} = \mathbf{A} \cdot \mathbf{B}^C$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=MB_C$,
+     * $NB_A=NB_B$ and $MB_B=NB_C$.
+     */
+    void
+    zmCmult(ScaLAPACKMatrix<NumberType> &      C,
+            const ScaLAPACKMatrix<NumberType> &B,
+            const bool                         adding = false) const;
+
+    /**
+     * Matrix-matrix-multiplication using conjugate transpose of $\mathbf{A}$
+     * and
+     * $\mathbf{B}$.
+     *
+     * The optional parameter @p adding determines whether the result is
+     * stored in $\mathbf{C}$ or added to $\mathbf{C}$.
+     *
+     * if (@p adding) $\mathbf{C} = \mathbf{C} + \mathbf{A}^C \cdot
+     * \mathbf{B}^T$
+     *
+     * else $\mathbf{C} = \mathbf{A}^C \cdot \mathbf{B}^C$
+     *
+     * It is assumed that $\mathbf{A}$ and $\mathbf{B}$ have compatible sizes
+     * and that
+     * $\mathbf{C}$ already has the right size.
+     *
+     * The following alignment conditions have to be fulfilled: $MB_A=NB_B$,
+     * $NB_A=MB_C$ and $MB_B=NB_C$.
+     */
+    void
+    zCmCmult(ScaLAPACKMatrix<NumberType> &      C,
+             const ScaLAPACKMatrix<NumberType> &B,
+             const bool                         adding = false) const;
+
+
+    /**
+     * Number of rows of the $M \times N$ matrix.
+     */
+    size_type
+    m() const;
+
+    /**
+     * Number of columns of the $M \times N$ matrix.
+     */
+    size_type
+    n() const;
+
+    /**
+     * Number of local rows on this MPI processes.
+     */
+    unsigned int
+    local_m() const;
+
+    /**
+     * Number of local columns on this MPI process.
+     */
+    unsigned int
+    local_n() const;
+
+    /**
+     * Return the global row number for the given local row @p loc_row .
+     */
+    unsigned int
+    global_row(const unsigned int loc_row) const;
+
+    /**
+     * Return the global column number for the given local column @p loc_column.
+     */
+    unsigned int
+    global_column(const unsigned int loc_column) const;
+
+    /**
+     * Read access to local element.
+     */
+    NumberType
+    local_el(const unsigned int loc_row, const unsigned int loc_column) const;
+
+    /**
+     * Write access to local element.
+     */
+    NumberType &
+    local_el(const unsigned int loc_row, const unsigned int loc_column);
+
+    /**
+     * Compute the Cholesky factorization of the matrix using ScaLAPACK
+     * function <code>pXpotrf</code>. The result of the factorization is stored
+     * in this object.
+     */
+    void
+    compute_cholesky_factorization();
+
+    /**
+     * Compute the LU factorization of the matrix using ScaLAPACK
+     * function <code>pXgetrf</code> and partial pivoting with row interchanges.
+     * The result of the factorization is stored in this object.
+     */
+    void
+    compute_lu_factorization();
+
+    /**
+     * Invert the matrix by first computing a Cholesky for hermitian matrices
+     * or a LU factorization for general matrices and then
+     * building the actual inverse using <code>pXpotri</code> or
+     * <code>pXgetri</code>. If the matrix is triangular, the LU factorization
+     * step is skipped, and <code>pXtrtri</code> is used directly.
+     *
+     * If a Cholesky or LU factorization has been applied previously,
+     * <code>pXpotri</code> or <code>pXgetri</code> are called directly.
+     *
+     * The inverse is stored in this object.
+     */
+    void
+    invert();
+
+    /**
+     * Computing selected eigenvalues and, optionally, the eigenvectors of the
+     * real hermitian matrix $\mathbf{A} \in \mathbb{R}^{M \times M}$.
+     *
+     * The eigenvalues/eigenvectors are selected by prescribing a range of indices @p index_limits.
+     *
+     * If successful, the computed eigenvalues are arranged in ascending order.
+     * The eigenvectors are stored in the columns of the matrix, thereby
+     * overwriting the original content of the matrix.
+     *
+     * If all eigenvalues/eigenvectors have to be computed, pass the closed interval $ \left[ 0, M-1 \right] $ in @p index_limits.
+     *
+     * Pass the closed interval $ \left[ M-r, M-1 \right] $ if the $r$ largest
+     * eigenvalues/eigenvectors are desired.
+     */
+    std::vector<double>
+    eigenpairs_hermitian_by_index(
+      const std::pair<unsigned int, unsigned int> &index_limits,
+      const bool                                   compute_eigenvectors);
+
+    /**
+     * Computing selected eigenvalues and, optionally, the eigenvectors of the
+     * real hermitian matrix $\mathbf{A} \in \mathbb{R}^{M \times M}$ using the
+     * MRRR algorithm.
+     *
+     * The eigenvalues/eigenvectors are selected by prescribing a range of indices @p index_limits.
+     *
+     * If successful, the computed eigenvalues are arranged in ascending order.
+     * The eigenvectors are stored in the columns of the matrix, thereby
+     * overwriting the original content of the matrix.
+     *
+     * If all eigenvalues/eigenvectors have to be computed, pass the closed interval $ \left[ 0, M-1 \right] $ in @p index_limits.
+     *
+     * Pass the closed interval $ \left[ M-r, M-1 \right] $ if the $r$ largest
+     * eigenvalues/eigenvectors are desired.
+     */
+    std::vector<double>
+    eigenpairs_hermitian_by_index_MRRR(
+      const std::pair<unsigned int, unsigned int> &index_limits,
+      const bool                                   compute_eigenvectors);
+
+  private:
+    /**
+     * Computing selected eigenvalues and, optionally, the eigenvectors.
+     * The eigenvalues/eigenvectors are selected by either prescribing a range of indices @p index_limits
+     * or a range of values @p value_limits for the eigenvalues. The function will throw an exception
+     * if both ranges are prescribed (meaning that both ranges differ from the
+     * default value) as this ambiguity is prohibited. If successful, the
+     * computed eigenvalues are arranged in ascending order. The eigenvectors
+     * are stored in the columns of the matrix, thereby overwriting the original
+     * content of the matrix.
+     */
+    std::vector<double>
+    eigenpairs_hermitian(
+      const bool                                   compute_eigenvectors,
+      const std::pair<unsigned int, unsigned int> &index_limits =
+        std::make_pair(dealii::numbers::invalid_unsigned_int,
+                       dealii::numbers::invalid_unsigned_int),
+      const std::pair<double, double> &value_limits =
+        std::make_pair(std::numeric_limits<double>::quiet_NaN(),
+                       std::numeric_limits<double>::quiet_NaN()));
+
+    /**
+     * Computing selected eigenvalues and, optionally, the eigenvectors of the
+     * real hermitian matrix $\mathbf{A} \in \mathbb{R}^{M \times M}$ using the
+     * MRRR algorithm.
+     * The eigenvalues/eigenvectors are selected by either prescribing a range of indices @p index_limits
+     * or a range of values @p value_limits for the eigenvalues. The function will throw an exception
+     * if both ranges are prescribed (meaning that both ranges differ from the
+     * default value) as this ambiguity is prohibited.
+     *
+     * By calling this function the original content of the matrix will be
+     * overwritten. If requested, the eigenvectors are stored in the columns of
+     * the matrix. Also in the case that just the eigenvalues are required, the
+     * content of the matrix will be overwritten.
+     *
+     * If successful, the computed eigenvalues are arranged in ascending order.
+     *
+     * @note Due to a bug in Netlib-ScaLAPACK, either all or no eigenvectors can be computed.
+     * Therefore, the input @p index_limits has to be set accordingly. Using Intel-MKL this restriction is not required.
+     */
+    std::vector<double>
+    eigenpairs_hermitian_MRRR(
+      const bool                                   compute_eigenvectors,
+      const std::pair<unsigned int, unsigned int> &index_limits =
+        std::make_pair(dealii::numbers::invalid_unsigned_int,
+                       dealii::numbers::invalid_unsigned_int),
+      const std::pair<double, double> &value_limits =
+        std::make_pair(std::numeric_limits<double>::quiet_NaN(),
+                       std::numeric_limits<double>::quiet_NaN()));
+
+
+    /**
+     * local storage
+     */
+    std::vector<NumberType> values;
+
+    /**
+     * Since ScaLAPACK operations notoriously change the meaning of the matrix
+     * entries, we record the current state after the last operation here.
+     */
+    dftefeScalapack::LAPACKSupport::State state;
+
+    /**
+     * Additional property of the matrix which may help to select more
+     * efficient ScaLAPACK functions.
+     */
+    dftefeScalapack::LAPACKSupport::Property property;
+
+    /**
+     * A shared pointer to a Utilities::MPI::ProcessGrid object which contains a
+     * BLACS context and a MPI communicator, as well as other necessary data
+     * structures.
+     */
+    std::shared_ptr<const dftefeScalapack::ProcessGrid> grid;
+
+    /**
+     * Number of rows in the matrix.
+     */
+    int n_rows;
+
+    /**
+     * Number of columns in the matrix.
+     */
+    int n_columns;
+
+    /**
+     * Row block size.
+     */
+    int row_block_size;
+
+    /**
+     * Column block size.
+     */
+    int column_block_size;
+
+    /**
+     * Number of rows in the matrix owned by the current process.
+     */
+    int n_local_rows;
+
+    /**
+     * Number of columns in the matrix owned by the current process.
+     */
+    int n_local_columns;
+
+    /**
+     * ScaLAPACK description vector.
+     */
+    int descriptor[9];
+
+    /**
+     * Workspace array.
+     */
+    mutable std::vector<NumberType> work;
+
+    /**
+     * Integer workspace array.
+     */
+    mutable std::vector<int> iwork;
+
+    /**
+     * Integer array holding pivoting information required
+     * by ScaLAPACK's matrix factorization routines.
+     */
+    std::vector<int> ipiv;
+
+    /**
+     * A character to define where elements are stored in case
+     * ScaLAPACK operations support this.
+     */
+    const char uplo;
+
+    /**
+     * The process row of the process grid over which the first row
+     * of the global matrix is distributed.
+     */
+    const int first_process_row;
+
+    /**
+     * The process column of the process grid over which the first column
+     * of the global matrix is distributed.
+     */
+    const int first_process_column;
+
+    /**
+     * Global row index that determines where to start a submatrix.
+     * Currently this equals unity, as we don't use submatrices.
+     */
+    const int submatrix_row;
+
+    /**
+     * Global column index that determines where to start a submatrix.
+     * Currently this equals unity, as we don't use submatrices.
+     */
+    const int submatrix_column;
+
+    /**
+     * Thread mutex.
+     */
+    mutable dealii::Threads::Mutex mutex;
+  };
+
+  // ----------------------- inline functions ----------------------------
+
+#ifndef DOXYGEN
+
+  template <typename NumberType>
+  inline NumberType
+  ScaLAPACKMatrix<NumberType>::local_el(const unsigned int loc_row,
+                                        const unsigned int loc_column) const
+  {
+    return values[loc_column * n_local_rows + loc_row];
+    // return (*this)(loc_row, loc_column);
+  }
+
+
+
+  template <typename NumberType>
+  inline NumberType &
+  ScaLAPACKMatrix<NumberType>::local_el(const unsigned int loc_row,
+                                        const unsigned int loc_column)
+  {
+    return values[loc_column * n_local_rows + loc_row];
+    // return (*this)(loc_row, loc_column);
+  }
+
+
+  template <typename NumberType>
+  inline unsigned int
+  ScaLAPACKMatrix<NumberType>::m() const
+  {
+    return n_rows;
+  }
+
+
+
+  template <typename NumberType>
+  inline unsigned int
+  ScaLAPACKMatrix<NumberType>::n() const
+  {
+    return n_columns;
+  }
+
+
+
+  template <typename NumberType>
+  unsigned int
+  ScaLAPACKMatrix<NumberType>::local_m() const
+  {
+    return n_local_rows;
+  }
+
+
+
+  template <typename NumberType>
+  unsigned int
+  ScaLAPACKMatrix<NumberType>::local_n() const
+  {
+    return n_local_columns;
+  }
+
+
+#endif // DOXYGEN
+
+
+} // namespace dftefeScalapack
+#endif // ScaLAPACKMatrix_H_
diff --git a/src/linearAlgebra/GeneralMatrix.h b/src/linearAlgebra/GeneralMatrix.h
index a632560de..aefc1d2ea 100644
--- a/src/linearAlgebra/GeneralMatrix.h
+++ b/src/linearAlgebra/GeneralMatrix.h
@@ -44,6 +44,90 @@ namespace dftefe
                     size_t   nb = global_nb,
                     size_t   mb = global_mb);
 
+      /**
+       * @brief The value setter interface for the matrix class. This setter
+       *        assumes the given pointer to ValueType to be a serially repeated
+       *        matrix and copy the data to the corresponding local owned
+       *        location trivially.
+       * @warning There is no boundary check in this function. The user should
+       *          be responsible that the size of data array should have d_m*d_n
+       *          size.
+       * @param data The pointer to the data to be copied from.
+       */
+      virtual void
+      setValues(const ValueType *data) override;
+
+      /**
+       * @brief The value setter interface for the matrix class. This setter
+       *        assumes the given pointer to ValueType to be a serially repeated
+       *        matrix and copy the data to the corresponding local owned
+       *        sub-matrix (i1:i2-1, j1:j2-1) trivially.
+       * @warning There is no boundary check in this function. The user should
+       *          be responsible that the size of data array should have
+       *          (j2-j1)*(i2-i1) size.
+       * @param i1 the global index of the first row of the submatrix.
+       * @param i2 one passes the global index of the last row of the submatrix.
+       * @param j1 the global index of the first column of the submatrix.
+       * @param j2 one passes the global index of the last column of the
+       *           submatrix.
+       * @param data The pointer to the data to be copied from. The user should
+       *             be responsible that the size of data array should have
+       *             (j2-j1)*(i2-i1) size.
+       */
+      virtual void
+      setValues(size_t           i1,
+                size_t           i2,
+                size_t           j1,
+                size_t           j2,
+                const ValueType *data) override;
+
+      //      /**
+      //       * @brief The value setter interface for the matrix class. This setter
+      //       *        allows data to be different on each processor. The
+      //       locally owned
+      //       *        data contains two parts: (1) values belongs to the
+      //       locally owned
+      //       *        portion of the matrix and (2) values belong to the
+      //       off-processor
+      //       *        portion of the matrix. This setter will distribute the
+      //       *        off-processor values correspondingly.
+      //       * @warning This routine currently is not optimized and fully
+      //       *          tested. Using this routing to assign values can
+      //       *          deteriorate the performance dramatically. This should
+      //       *          only be used for debugging purpose or not performance
+      //       *          relevant part of the code.
+      //       * @param i1 the global index of the first row of the submatrix.
+      //       * @param i2 one passes the global index of the last row of the submatrix.
+      //       * @param j1 the global index of the first column of the submatrix.
+      //       * @param j2 one passes the global index of the last column of the
+      //       *           submatrix.
+      //       * @param data The pointer to the data to be copied from. The user should
+      //       *             be responsible that the size of data array should
+      //       have
+      //       *             (j2-j1)*(i2-i1) size.
+      //       */
+      //      virtual void
+      //      setDistributedValues(size_t           i1,
+      //                           size_t           i2,
+      //                           size_t           j1,
+      //                           size_t           j2,
+      //                           const ValueType *data) = 0;
+      //
+      //      /**
+      //       * @brief The value setter interface for inserting a single value.
+      //       * @warning This routine is sub-optimal and can seriously deteriorate the
+      //       *          the performance. It should be only used when
+      //       necessary. Only
+      //       *          the processor which owns (i, j) element will be
+      //       inserting
+      //       *          value.
+      //       * @param i the row index of the value.
+      //       * @param j the column index of the value.
+      //       * @param d the value.
+      //       */
+      //      virtual void
+      //      setValue(size_t i, size_t j, ValueType d) = 0;
+
       slate::Matrix<ValueType> &
       getSlateMatrix() const;
 
diff --git a/src/linearAlgebra/GeneralMatrix.t.cpp b/src/linearAlgebra/GeneralMatrix.t.cpp
index c94ebe68b..731340774 100644
--- a/src/linearAlgebra/GeneralMatrix.t.cpp
+++ b/src/linearAlgebra/GeneralMatrix.t.cpp
@@ -49,12 +49,35 @@ namespace dftefe
         {
           d_matrix->insertLocalTiles(slate::Target::Host);
         }
+      AbstractMatrix<ValueType, memorySpace>::initSlateMatrix(d_matrix);
     }
+
+
     template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
     slate::Matrix<ValueType> &
     GeneralMatrix<ValueType, memorySpace>::getSlateMatrix() const
     {
-      return d_matrix;
+      return *d_matrix;
+    }
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    GeneralMatrix<ValueType, memorySpace>::setValues(const ValueType *data)
+    {
+      AbstractMatrix<ValueType, memorySpace>::setValueSlateMatrix(d_baseMatrix,
+                                                                  data);
+    }
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    GeneralMatrix<ValueType, memorySpace>::setValues(size_t           i1,
+                                                     size_t           i2,
+                                                     size_t           j1,
+                                                     size_t           j2,
+                                                     const ValueType *data)
+    {
+      slate::Matrix<ValueType> mat = d_matrix->sub(i1, i2, j1, j2);
+      AbstractMatrix<ValueType, memorySpace>::setValueSlateMatrix(&mat, data);
     }
   } // namespace linearAlgebra
 } // namespace dftefe
diff --git a/src/linearAlgebra/HermitianMatrix.h b/src/linearAlgebra/HermitianMatrix.h
index d04253a57..6b3522674 100644
--- a/src/linearAlgebra/HermitianMatrix.h
+++ b/src/linearAlgebra/HermitianMatrix.h
@@ -43,6 +43,43 @@ namespace dftefe
                       size_t   q,
                       size_t   nb = global_nb);
 
+      /**
+       * @brief The value setter interface for the matrix class. This setter
+       *        assumes the given pointer to ValueType to be a serially repeated
+       *        matrix and copy the data to the corresponding local owned
+       *        location trivially.
+       * @warning There is no boundary check in this function. The user should
+       *          be responsible that the size of data array should have d_m*d_n
+       *          size.
+       * @param data The pointer to the data to be copied from.
+       */
+      virtual void
+      setValues(const ValueType *data) override;
+
+      /**
+       * @brief The value setter interface for the matrix class. This setter
+       *        assumes the given pointer to ValueType to be a serially repeated
+       *        matrix and copy the data to the corresponding local owned
+       *        sub-matrix (i1:i2-1, j1:j2-1) trivially.
+       * @warning There is no boundary check in this function. The user should
+       *          be responsible that the size of data array should have
+       *          (j2-j1)*(i2-i1) size.
+       * @param i1 the global index of the first row of the submatrix.
+       * @param i2 one passes the global index of the last row of the submatrix.
+       * @param j1 the global index of the first column of the submatrix.
+       * @param j2 one passes the global index of the last column of the
+       *           submatrix.
+       * @param data The pointer to the data to be copied from. The user should
+       *             be responsible that the size of data array should have
+       *             (j2-j1)*(i2-i1) size.
+       */
+      virtual void
+      setValues(size_t           i1,
+                size_t           i2,
+                size_t           j1,
+                size_t           j2,
+                const ValueType *data) override;
+
       slate::HermitianMatrix<ValueType> &
       getSlateMatrix() const;
 
diff --git a/src/linearAlgebra/HermitianMatrix.t.cpp b/src/linearAlgebra/HermitianMatrix.t.cpp
index 87549d9b4..0e66592f3 100644
--- a/src/linearAlgebra/HermitianMatrix.t.cpp
+++ b/src/linearAlgebra/HermitianMatrix.t.cpp
@@ -48,12 +48,30 @@ namespace dftefe
         {
           d_matrix->insertLocalTiles(slate::Target::Host);
         }
+      AbstractMatrix<ValueType, memorySpace>::initSlateMatrix(d_matrix);
     }
     template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
     slate::HermitianMatrix<ValueType> &
     HermitianMatrix<ValueType, memorySpace>::getSlateMatrix() const
     {
-      return d_matrix;
+      return *d_matrix;
     }
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    HermitianMatrix<ValueType, memorySpace>::setValues(const ValueType *data)
+    {
+      AbstractMatrix<ValueType, memorySpace>::setValueSlateMatrix(d_baseMatrix,
+                                                                  data);
+    }
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    HermitianMatrix<ValueType, memorySpace>::setValues(size_t           i1,
+                                                       size_t           i2,
+                                                       size_t           j1,
+                                                       size_t           j2,
+                                                       const ValueType *data)
+    {}
   } // namespace linearAlgebra
 } // namespace dftefe
diff --git a/src/linearAlgebra/DistributedDenseMatrix.cpp b/src/linearAlgebra/MatVecOperations.cpp
similarity index 95%
rename from src/linearAlgebra/DistributedDenseMatrix.cpp
rename to src/linearAlgebra/MatVecOperations.cpp
index fcc3ff860..b18012fa0 100644
--- a/src/linearAlgebra/DistributedDenseMatrix.cpp
+++ b/src/linearAlgebra/MatVecOperations.cpp
@@ -20,10 +20,7 @@
  ******************************************************************************/
 
 /*
- * @author Ian C. Lin.
+ * @author Sambit Das.
  */
 
-#include "DistributedDenseMatrix.h"
-
-namespace dftefe
-{}
+#include "MatVecOperations.h"
diff --git a/src/linearAlgebra/MatVecOperations.h b/src/linearAlgebra/MatVecOperations.h
new file mode 100644
index 000000000..d575f7746
--- /dev/null
+++ b/src/linearAlgebra/MatVecOperations.h
@@ -0,0 +1,136 @@
+/******************************************************************************
+ * Copyright (c) 2021.                                                        *
+ * The Regents of the University of Michigan and DFT-EFE developers.          *
+ *                                                                            *
+ * This file is part of the DFT-EFE code.                                     *
+ *                                                                            *
+ * DFT-EFE is free software: you can redistribute it and/or modify            *
+ *   it under the terms of the Lesser GNU General Public License as           *
+ *   published by the Free Software Foundation, either version 3 of           *
+ *   the License, or (at your option) any later version.                      *
+ *                                                                            *
+ * DFT-EFE is distributed in the hope that it will be useful, but             *
+ *   WITHOUT ANY WARRANTY; without even the implied warranty                  *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                     *
+ *   See the Lesser GNU General Public License for more details.              *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public           *
+ *   License at the top level of DFT-EFE distribution.  If not, see           *
+ *   <https://www.gnu.org/licenses/>.                                         *
+ ******************************************************************************/
+
+/*
+ * @author Sambit Das
+ */
+
+#ifndef dftefeMatVecOperations_h
+#define dftefeMatVecOperations_h
+
+#include <memory>
+#include <blas.hh>
+#include <linearAlgebra/BlasLapackTypedef.h>
+#include <linearAlgebra/BlasLapack.h>
+#include <linearAlgebra/MultiVector.h>
+#include <linearAlgebra/Matrix.h>
+#include <utils/MemorySpaceType.h>
+
+namespace dftefe
+{
+  namespace linearAlgebra
+  {
+    namespace MatVecOperations
+    {
+      /**
+       * @brief Compute \f$ {\bf S}= {\bf A}^{\dagger} {\bf A} \f$ where
+       * \f$ {\bf A} \f$ is a dense matrix of size M times N with distributed
+       * memory parallelization over the rows. \f$ {\bf A} \f$ is stored
+       * as a MultiVector with number of vectors to be N and row major
+       * storage. \f$ {\bf S} \f$ is the N times N Hermitian overlap matrix
+       * stored in block-cyclic format. The parallelization of \f$ {\bf S} \f$
+       * is independent of that of \f$ {\bf A} \f$
+       * @param[in] A the dense matrix stored as a MultiVector
+       * @param[in,out] S preallocated Hermitian overlap matrix
+       * @param[in] context LinAlg context
+       * @param[in] NBlockSize determines the block size for
+       * blocked loop over the N vectors during the computation
+       * of \f$ {\bf S} \f$, for default value of 0 it is heuristically
+       * determined. This aspect to required for reducing the local MPI task
+       * peak memory scaling to \f$ \approx \f$ N times vectorsBlockSize
+       * instead of N times N
+       */
+      template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+      void
+      computeAMultiVecConjTransTimesAMultiVec(
+        const MultiVector<ValueType, memorySpace> &A,
+        HermitianMatrix<ValueType, memorySpace> &  S,
+        LinAlgOpContext<memorySpace> &             context,
+        const size_type                            NBlockSize = 0);
+
+      /**
+       * @brief In-place computation of \f$ {\bf A}= {\bf A} {\bf T} \f$ where
+       * \f$ {\bf A} \f$ is a dense matrix of size M times N with distributed
+       * memory parallelization over the rows. \f$ {\bf A} \f$ is stored
+       * as a MultiVector with number of vectors to be N and row major
+       * storage. \f$ {\bf T} \f$ is either an upper or lower triangular
+       * N times N matrix stored in block cyclic format. The parallelization
+       * of \f$ {\bf T} \f$ is independent of that of \f$ {\bf A} \f$
+       * @param[in,out] A the dense matrix stored as a MultiVector
+       * @param[in] T preallocated triangular matrix
+       * @param[in] context LinAlg context
+       * @param[in] MBlockSize determines the block size for
+       * blocked loop over the M vector size during the in-place computation.
+       * Default value of 0 sets block size heuristically.
+       * @param[in] NBlockSize determines the block size for
+       * blocked loop over the N vectors. Default value of 0 sets
+       * block size heuristically.
+       */
+      template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+      void
+      computeAMultiVecTimesTriangularMat(
+        MultiVector<ValueType, memorySpace> &           A,
+        const TriangularMatrix<ValueType, memorySpace> &T,
+        LinAlgOpContext<memorySpace> &                  context,
+        const size_type                                 MBlockSize = 0,
+        const size_type                                 NBlockSize = 0);
+
+
+      /**
+       * @brief In-place computation of \f$ {\bf A}= {\bf A} {\bf G} \f$ where
+       * \f$ {\bf A} \f$ is a dense matrix of size M times N with distributed
+       * memory parallelization over the rows. \f$ {\bf A} \f$ is stored
+       * as a MultiVector with number of vectors to be N and row major
+       * storage. \f$ {\bf G} \f$ is a N times N general matrix
+       * stored in block cyclic format. The parallelization
+       * of \f$ {\bf G} \f$ is independent of that of \f$ {\bf A} \f$
+       * @param[in,out] A the dense matrix stored as a MultiVector
+       * @param[in] G preallocated general matrix
+       * @param[in] context LinAlg context
+       * @param[in] MBlockSize determines the block size for
+       * blocked loop over the M vector size during the in-place computation
+       * Default value of 0 sets block size heuristically.
+       * @param[in] NBlockSize determines the block size for
+       * blocked loop over the N vectors. Default value of 0 sets block size
+       * heuristically.
+       */
+      template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+      void
+      computeAMultiVecTimesGeneralMat(
+        MultiVector<ValueType, memorySpace> &        A,
+        const GeneralMatrix<ValueType, memorySpace> &G,
+        LinAlgOpContext<memorySpace> &               context,
+        const size_type                              MBlockSize = 0,
+        const size_type                              NBlockSize = 0);
+
+
+      template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+      void
+      choleskyOrthogonalization(MultiVector<ValueType, memorySpace> &A,
+                                LinAlgOpContext<memorySpace> &       context);
+
+    } // namespace MatVecOperations
+  }   // namespace linearAlgebra
+
+} // namespace dftefe
+
+#include "MatVecOperations.t.cpp"
+#endif // dftefeMatVecOperations_h
diff --git a/src/linearAlgebra/MatVecOperations.t.cpp b/src/linearAlgebra/MatVecOperations.t.cpp
new file mode 100644
index 000000000..a3d6a6332
--- /dev/null
+++ b/src/linearAlgebra/MatVecOperations.t.cpp
@@ -0,0 +1,143 @@
+/******************************************************************************
+ * Copyright (c) 2021.                                                        *
+ * The Regents of the University of Michigan and DFT-EFE developers.          *
+ *                                                                            *
+ * This file is part of the DFT-EFE code.                                     *
+ *                                                                            *
+ * DFT-EFE is free software: you can redistribute it and/or modify            *
+ *   it under the terms of the Lesser GNU General Public License as           *
+ *   published by the Free Software Foundation, either version 3 of           *
+ *   the License, or (at your option) any later version.                      *
+ *                                                                            *
+ * DFT-EFE is distributed in the hope that it will be useful, but             *
+ *   WITHOUT ANY WARRANTY; without even the implied warranty                  *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                     *
+ *   See the Lesser GNU General Public License for more details.              *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public           *
+ *   License at the top level of DFT-EFE distribution.  If not, see           *
+ *   <https://www.gnu.org/licenses/>.                                         *
+ ******************************************************************************/
+
+/*
+ * @author Sambit Das
+ */
+
+#include <utils/DataTypeOverloads.h>
+
+namespace dftefe
+{
+  namespace linearAlgebra
+  {
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    MatVecOperations::computeAMultiVecConjTransTimesAMultiVec(
+      const MultiVector<ValueType, memorySpace> &A,
+      HermitianMatrix<ValueType, memorySpace> &  S,
+      LinAlgOpContext<memorySpace> &             context,
+      const size_type                            NBlockSize)
+    {
+      const size_type N = A.numVectors();
+
+      const size_type maxEntriesInBlock =
+        40000 * 400; // about 250 MB in FP64 datatype
+      const size_type NBlockSizeUsed =
+        NBlockSize == 0 ? (maxEntriesInBlock / N) : NBlockSize;
+
+      dftefe::utils::MemoryStorage<ValueType, memorySpace> overlapMatrixBlock(
+        N * NBlockSizeUsed, 0);
+
+      for (size_type ivec = 0; ivec < N; ivec += NBlockSizeUsed)
+        {
+          // Correct block dimensions if block "goes off edge of" the matrix
+          const size_type B = std::min(NBlockSizeUsed, N - ivec);
+
+          const blasLapack::Op
+            transA = blasLapack::Op::NoTrans,
+            transB = std::is_same<ValueType, std::complex<double>>::value ?
+                       blasLapack::Op::ConjTrans :
+                       blasLapack::Op::Trans;
+          const ValueType scalarCoeffAlpha = 1.0, scalarCoeffBeta = 0.0;
+
+          // member function to be created
+          overlapMatrixBlock = 0.;
+
+          const unsigned int D = N - ivec;
+
+          // Block1: (0,M-1,ivec-1,N-1)
+          // Block2: (0,M-1,ivec-1,ivec+B-1)
+          // Comptute local ABlock1^{Trans}*ABlock2Conj.
+          blasLapack::gemm<ValueType, memorySpace>(
+            blasLapack::Layout::ColMajor,
+            transA,
+            transB,
+            D,
+            B,
+            A.getMPIPatternP2P()->localOwnedSize(),
+            scalarCoeffAlpha,
+            A.data() + ivec,
+            N,
+            A.data() + ivec,
+            N,
+            scalarCoeffBeta,
+            overlapMatrixBlock.data(),
+            D,
+            context);
+
+
+          // Sum local ABlock1^{Trans}*ABlock2Conj across MPI tasks
+          utils::mpi::MPIAllreduce<memorySpace>(
+            MPI_IN_PLACE,
+            overlapMatrixBlock.data(),
+            D * B * sizeof(ValueType),
+            utils::mpi::MPIByte,
+            utils::mpi::MPISum,
+            A.getMPIPatternP2P()->mpiCommunicator());
+
+
+          // Copy only the lower/upper triangular part to the Hermitian
+          // overlap matrix, the matrix object already assumed to be created
+          // outside this function
+          // FIXME:The setValues function should work for the case where
+          // the Hermitian matrix is actually storing upper triangular values
+          // and the setValues function tries to set for the lower triangular
+          // part
+          S.setValues(
+            ivec - 1, N - 1, ivec - 1, ivec + B - 1, overlapMatrixBlock.data());
+        }
+
+      // FIXME: to be implemented (not complex conjugate transpose)
+      // S.complexConjugate();
+    }
+
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    computeAMultiVecTimesTriangularMat(
+      MultiVector<ValueType, memorySpace> &           A,
+      const TriangularMatrix<ValueType, memorySpace> &T,
+      LinAlgOpContext<memorySpace> &                  context,
+      const size_type                                 MBlockSize = 0,
+      const size_type                                 NBlockSize = 0)
+    {}
+
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    computeAMultiVecTimesGeneralMat(
+      MultiVector<ValueType, memorySpace> &        A,
+      const GeneralMatrix<ValueType, memorySpace> &T,
+      LinAlgOpContext<memorySpace> &               context,
+      const size_type                              MBlockSize = 0,
+      const size_type                              NBlockSize = 0)
+    {}
+
+
+    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
+    void
+    MatVecOperations::choleskyOrthogonalization(
+      MultiVector<ValueType, memorySpace> &A,
+      LinAlgOpContext<memorySpace> &       context)
+    {}
+  } // namespace linearAlgebra
+} // namespace dftefe
diff --git a/src/linearAlgebra/MatrixOperations.t.cpp b/src/linearAlgebra/MatrixOperations.t.cpp
index 0a642ee34..deabe899f 100644
--- a/src/linearAlgebra/MatrixOperations.t.cpp
+++ b/src/linearAlgebra/MatrixOperations.t.cpp
@@ -37,22 +37,28 @@ namespace dftefe
                                ValueType                              beta,
                                GeneralMatrix<ValueType, memorySpace> &C)
     {
-      if (memorySpace == dftefe::utils::MemorySpace::DEVICE)
-        {
-          slate::multiply(alpha,
-                          A.getSlateMatrix(),
-                          B.getSlateMatrix(),
-                          beta,
-                          {slate::Option::Target, slate::Target::Devices});
-        }
-      else
-        {
-          slate::multiply(alpha,
-                          A.getSlateMatrix(),
-                          B.getSlateMatrix(),
-                          beta,
-                          {slate::Option::Target, slate::Target::HostTask});
-        }
+      //      if (memorySpace == dftefe::utils::MemorySpace::DEVICE)
+      //        {
+      //          slate::multiply(alpha,
+      //                          A.getSlateMatrix(),
+      //                          B.getSlateMatrix(),
+      //                          beta,
+      //                          {slate::Option::Target,
+      //                          slate::Target::Devices});
+      //        }
+      //      else
+      //        {
+      //          slate::multiply(alpha,
+      //                          A.getSlateMatrix(),
+      //                          B.getSlateMatrix(),
+      //                          beta,
+      //                          {slate::Option::Target, slate::Target::Host});
+      slate::multiply(alpha,
+                      A.getSlateMatrix(),
+                      B.getSlateMatrix(),
+                      beta,
+                      C.getSlateMatrix());
+      //        }
     }
 
 
@@ -78,7 +84,7 @@ namespace dftefe
                           A.getSlateMatrix(),
                           B.getSlateMatrix(),
                           beta,
-                          {slate::Option::Target, slate::Target::HostTask});
+                          {slate::Option::Target, slate::Target::Host});
         }
     }
 
@@ -106,7 +112,7 @@ namespace dftefe
                           A.getSlateMatrix(),
                           B.getSlateMatrix(),
                           beta,
-                          {slate::Option::Target, slate::Target::HostTask});
+                          {slate::Option::Target, slate::Target::Host});
         }
     }
 
@@ -127,7 +133,7 @@ namespace dftefe
         {
           slate::multiply(A.getSlateMatrix(),
                           B.getSlateMatrix(),
-                          {slate::Option::Target, slate::Target::HostTask});
+                          {slate::Option::Target, slate::Target::Host});
         }
     }
 
@@ -149,7 +155,7 @@ namespace dftefe
         {
           slate::multiply(A.getSlateMatrix(),
                           B.getSlateMatrix(),
-                          {slate::Option::Target, slate::Target::HostTask});
+                          {slate::Option::Target, slate::Target::Host});
         }
     }
 
@@ -171,8 +177,7 @@ namespace dftefe
         {
           slate::triangular_solve(A.getSlateMatrix(),
                                   B.getSlateMatrix(),
-                                  {slate::Option::Target,
-                                   slate::Target::HostTask});
+                                  {slate::Option::Target, slate::Target::Host});
         }
     }
 
@@ -195,8 +200,7 @@ namespace dftefe
         {
           slate::triangular_solve(A.getSlateMatrix(),
                                   B.getSlateMatrix(),
-                                  {slate::Option::Target,
-                                   slate::Target::HostTask});
+                                  {slate::Option::Target, slate::Target::Host});
         }
     }
 
diff --git a/src/linearAlgebra/DistributedDenseMatrix.h b/src/linearAlgebra/MatrixProperties.h
similarity index 83%
rename from src/linearAlgebra/DistributedDenseMatrix.h
rename to src/linearAlgebra/MatrixProperties.h
index c447479e5..7f1682a60 100644
--- a/src/linearAlgebra/DistributedDenseMatrix.h
+++ b/src/linearAlgebra/MatrixProperties.h
@@ -23,21 +23,15 @@
  * @author Ian C. Lin.
  */
 
-#ifndef dftefeDistributedDenseMatrix_h
-#define dftefeDistributedDenseMatrix_h
-
-#include "Matrix.h"
+#ifndef dftefeMatrixProperties_h
+#define dftefeMatrixProperties_h
 
 namespace dftefe
 {
   namespace linearAlgebra
   {
-    template <typename ValueType, dftefe::utils::MemorySpace memorySpace>
-    class DistributedDenseMatrix : protected Matrix<ValueType, memorySpace>
-    {};
-  } // namespace linearAlgebra
+    enum class Uplo
+  }
 } // namespace dftefe
 
-
-
-#endif // dftefeDistributedDenseMatrix_h
+#endif // dftefeMatrixProperties_h
diff --git a/src/main.cpp b/src/main.cpp
index a62c06021..7981d13bc 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,7 +10,8 @@
 #include <iostream>
 #include <stdio.h>
 
-
+#include <slate/slate.hh>
+#include <slate/print.hh>
 #ifdef DFTEFE_WITH_MPI
 #  include <mpi.h>
 #endif
@@ -19,6 +20,9 @@
 #  include <utils/DeviceUtils.h>
 #endif
 
+#include <linearAlgebra/GeneralMatrix.h>
+#include <linearAlgebra/MatrixOperations.h>
+
 int
 main(int argc, char **argv)
 {
@@ -53,4 +57,51 @@ main(int argc, char **argv)
     }
   printf("This is gpu code");
 #endif
+
+  std::vector<double> dA(25, 0.0), dB(15, 0.0);
+  for (int i = 0; i < 25; ++i)
+    dA[i] = i + 1;
+  for (int i = 0; i < 15; ++i)
+    dB[i] = i + 25;
+
+  //    dftefe::linearAlgebra::HermitianMatrix<double,
+  //    dftefe::utils::MemorySpace::HOST>
+  //      A(dftefe::linearAlgebra::Uplo::Lower, 5, MPI_COMM_SELF, 1, 1);
+
+  dftefe::linearAlgebra::GeneralMatrix<double, dftefe::utils::MemorySpace::HOST>
+    A(5, 5, MPI_COMM_SELF, 1, 1, 5, 5);
+  //    B(5, 3, MPI_COMM_SELF, 1, 1, 3, 5),
+  //    C(5, 3, MPI_COMM_SELF, 1, 1, 3, 5);
+  // A.setValues(dA.data());
+  //  A.setValues(1, 4, 2, 3, dA.data());
+  //  B.setValues(dB.data());
+  slate::print("matrix A", A.getSlateMatrix());
+  //  slate::print("matrix B", B.getSlateMatrix());
+  //  dftefe::linearAlgebra::MatrixOperations::multiply<double,
+  //  dftefe::utils::MemorySpace::HOST>(1.0, A, B, 0.0, C);
+
+  //  blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans,
+  //  5, 3, 5, 1.0, A.getSlateMatrix()(0,0).data(), 5,
+  //  B.getSlateMatrix()(0,0).data(), 5, 0.0, C.getSlateMatrix()(0,0).data(),
+  //  5); slate::print("matrix C", C.getSlateMatrix());
+
+  //  std::vector<double> dD = {1.148258296912840, 1.001372859769502,
+  //  0.994403249881076, 1.638906883577435, 1.316661477930142, 1.001372859769502,
+  //  2.019228542698278, 1.612894116737191, 1.932571381399861, 2.047167619624847,
+  //  0.994403249881076, 1.612894116737191, 2.790965744547513, 2.622748446448737,
+  //  2.828958221480860, 1.638906883577435, 1.932571381399861, 2.622748446448737,
+  //  3.486600154718786, 3.283956672039348, 1.316661477930142, 2.047167619624847,
+  //  2.828958221480860, 3.283956672039348, 3.352644057275798};
+  //    dftefe::linearAlgebra::GeneralMatrix<double,
+  //    dftefe::utils::MemorySpace::HOST>
+  //      D(5, 5, MPI_COMM_SELF, 1, 1, 5, 5), Z(5, 5, MPI_COMM_SELF, 1, 1, 5,
+  //      5);
+  //    D.setValues(dD.data());
+  //    slate::print("matrix", D.getSlateMatrix());
+  //
+  //    slate::SymmetricMatrix<double> Dh(slate::Uplo::Lower,
+  //    D.getSlateMatrix()); slate::print("matrix", Dh);
+  //    lapack::syev(lapack::Job::Vec, lapack::Uplo::Lower, 5, Dh(0,0).data(),
+  //    5, Z.getSlateMatrix()(0,0).data()); slate::print("matrix",
+  //    Z.getSlateMatrix()); slate::print("matrix", Dh);
 }