diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index b21cdc40..00000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff3d9a31..13c0e7c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,13 +23,6 @@
 if(${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
   message(FATAL_ERROR "You cannot build ASSET with the Microsoft Compiler. Please use Clang or GCC and try again.")
 endif()
 
-if(NOT WIN32)
-  if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
-    set(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=lld")
-  endif()
-endif()
-
-
 ################################################################################
 ############### Build Settings #################################################
 ################################################################################
@@ -57,27 +50,40 @@
 list(APPEND RELEASE_FLAGS "-O2")
 
 ## Generic Binary Flags
 if(BUILD_ASSET_WHEEL)
-  #list(APPEND RELEASE_FLAGS "-mcx16;-mpopcnt;-msse3;-msse4.1;-msse4.2;-mssse3;-mavx;-mavx2;-mbmi;-mbmi2;-mf16c;-mfma;-mlzcnt;-mmovbe;-mxsave")
-  # x86-64-v3 flags written out for older compilers
-  list(APPEND RELEASE_FLAGS "-mcx16")
-  list(APPEND RELEASE_FLAGS "-mpopcnt")
-  list(APPEND RELEASE_FLAGS "-msse3")
-  list(APPEND RELEASE_FLAGS "-msse4.1")
-  list(APPEND RELEASE_FLAGS "-msse4.2")
-  list(APPEND RELEASE_FLAGS "-mssse3")
-  list(APPEND RELEASE_FLAGS "-mavx")
-  list(APPEND RELEASE_FLAGS "-mavx2")
-  list(APPEND RELEASE_FLAGS "-mbmi")
-  list(APPEND RELEASE_FLAGS "-mbmi2")
-  list(APPEND RELEASE_FLAGS "-mf16c")
-  list(APPEND RELEASE_FLAGS "-mfma")
-  list(APPEND RELEASE_FLAGS "-mlzcnt")
-  list(APPEND RELEASE_FLAGS "-mmovbe")
-  list(APPEND RELEASE_FLAGS "-mxsave")
+  # Detect architecture for wheel builds
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64")
+    # ARM64 (Apple Silicon, ARM servers) - use NEON
+    list(APPEND RELEASE_FLAGS "-march=armv8-a+simd")
+    message(STATUS "Building ARM64 wheel with NEON support")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
+    # x86-64 (Intel/AMD) - use AVX2 (x86-64-v3 flags written out for older compilers)
+    list(APPEND RELEASE_FLAGS "-mcx16")
+    list(APPEND RELEASE_FLAGS "-mpopcnt")
+    list(APPEND RELEASE_FLAGS "-msse3")
+    list(APPEND RELEASE_FLAGS "-msse4.1")
+    list(APPEND RELEASE_FLAGS "-msse4.2")
+    list(APPEND RELEASE_FLAGS "-mssse3")
+    list(APPEND RELEASE_FLAGS "-mavx")
+    list(APPEND RELEASE_FLAGS "-mavx2")
+    list(APPEND RELEASE_FLAGS "-mbmi")
+    list(APPEND RELEASE_FLAGS "-mbmi2")
+    list(APPEND RELEASE_FLAGS "-mf16c")
+    list(APPEND RELEASE_FLAGS "-mfma")
+    list(APPEND RELEASE_FLAGS "-mlzcnt")
+    list(APPEND RELEASE_FLAGS "-mmovbe")
+    list(APPEND RELEASE_FLAGS "-mxsave")
+    message(STATUS "Building x86-64 wheel with AVX2 support")
+  else()
+    # Fallback for other architectures
+    list(APPEND RELEASE_FLAGS "-march=native")
+    message(WARNING "Unknown architecture ${CMAKE_SYSTEM_PROCESSOR}, using -march=native")
+  endif()
 
   # Windows runs out of RAM with LINK_TIME_OPT on gh-actions
   if(NOT WIN32)
-    list(APPEND RELEASE_FLAGS "-mtune=skylake")
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
+      list(APPEND RELEASE_FLAGS "-mtune=skylake")
+    endif()
   endif()
   set(LINK_TIME_OPT TRUE)  ### DO LTO - Recommended for full release
   set(CLANG_MAX_INLINE_DEPTH 400)
@@ -154,18 +160,14 @@
 endif()
 
 if(NOT WIN32)
   list(APPEND RELEASE_FLAGS "-fomit-frame-pointer")
   list(APPEND RELEASE_FLAGS "-fno-stack-protector")
-  #list(APPEND RELEASE_FLAGS "-fno-stack-clash-protection")
   #list(APPEND RELEASE_FLAGS "-fcf-protection=none")
   list(APPEND RELEASE_FLAGS "-fno-asynchronous-unwind-tables")
"-fno-asynchronous-unwind-tables") list(APPEND RELEASE_FLAGS "-ffast-math") + if (NOT APPLE) + #list(APPEND RELEASE_FLAGS "-fno-stack-clash-protection") + endif() endif() -if(APPLE) - list(APPEND COMMON_FLAGS "-Xlinker -undefined") - list(APPEND COMMON_FLAGS "-Xlinker dynamic_lookup") -endif() - - # Combine Flags set(COMPILE_FLAGS PUBLIC ${COMMON_FLAGS}) @@ -210,17 +212,20 @@ endif() ###################### Find and set our dependencies ########################### ################################################################################ -if(APPLE) - include(FindPythonEnv) -endif() +#if(APPLE) +# include(FindPythonEnv) +#endif() + # Set up submodule dependencies include_directories(dep/eigen) include_directories(dep/fmt/include) include_directories(dep/autodiff) -add_subdirectory(dep) +find_package(Python ${PYVERSION_EXACT} COMPONENTS Interpreter Development REQUIRED) set(PYBIND11_CPP_STANDARD -std=c++17) +set(PYBIND11_FINDPYTHON ON) +add_subdirectory(dep) # Set up external dependencies find_package(Threads REQUIRED) @@ -233,38 +238,70 @@ if(UNIX AND NOT APPLE) list(APPEND COMPILE_FLAGS "-fopenmp") endif() elseif(APPLE) - if(FALSE) - include_directories("/usr/local/opt/llvm/include") - link_directories("/usr/local/opt/llvm/lib") - set(CMAKE_LIBRARY_PATH /usr/local/opt/llvm/lib ${CMAKE_LIBRARY_PATH}) - set(CMAKE_LIBRARY_PATH $ENV{MKLROOT}/lib ${CMAKE_LIBRARY_PATH}) - if(CMAKE_C_COMPILER_ID MATCHES "Clang") - set(OpenMP_C "${CMAKE_C_COMPILER}") - set(OpenMP_C_FLAGS "-fopenmp=libomp") - set(OpenMP_C_LIB_NAMES "libomp") - set(OpenMP_libomp_LIBRARY "omp") - endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(OpenMP_CXX "${CMAKE_CXX_COMPILER}") - set(OpenMP_CXX_FLAGS "-fopenmp=libomp") - set(OpenMP_CXX_LIB_NAMES "libomp") - set(OpenMP_libomp_LIBRARY "omp") + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") + # fina_package(OpenMP REQUIRED) will fail with AppleClang so + # we need this to point towards the homebrew libomp + include_directories("/opt/homebrew/Cellar/libomp/20.1.8/include/") + link_directories("/opt/homebrew/Cellar/libomp/20.1.8/lib/") + list(APPEND COMPILE_FLAGS "-Xclang") + list(APPEND COMPILE_FLAGS "-fopenmp") + list(APPEND OpenMP_CXX_FLAGS "-lomp") + else() + # With homebrew LLVM clang compiler on Mac, this seems to work well for + # finding homebrew OpenMP find_package(OpenMP REQUIRED) - list(APPEND COMPILE_FLAGS ${OpenMP_CXX_FLAGS}) - endif() - + list(APPEND COMPILE_FLAGS "-fopenmp") + list(APPEND OpenMP_CXX_FLAGS "-lomp") + + # Add flags to make LLVM Clang more compatible with Apple SDK headers + list(APPEND COMPILE_FLAGS "-Wno-elaborated-enum-base") + endif() endif() - endif() -find_package(MKL REQUIRED) +if(APPLE) + find_package(AccelerateSparse REQUIRED) + add_compile_definitions(USE_ACCELERATE_SPARSE) +else() + find_package(MKL REQUIRED) +endif() find_package(Python ${PYVERSION_EXACT} REQUIRED COMPONENTS Interpreter Development) # Set dependency variables -set(INCLUDE_DIRS ${PYBIND11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS} ${MKL_INCLUDE_DIRS}) -set(LINK_LIBS ${PYTHON_LIBRARIES} ${MKL_LIBRARIES} Threads::Threads ${CMAKE_DL_LIBS}) +set(INCLUDE_DIRS ${PYBIND11_INCLUDE_DIR} ${MKL_INCLUDE_DIRS} ${AccelerateSparse_INCLUDE_DIRS}) +set(LINK_LIBS ${MKL_LIBRARIES} Threads::Threads ${CMAKE_DL_LIBS} ${AccelerateSparse_LIBRARIES}) + +# Handle OpenMP for Apple +if (APPLE) + if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang") + list(APPEND LINK_LIBS ${OpenMP_CXX_FLAGS}) + else() + list(APPEND LINK_LIBS OpenMP::OpenMP_CXX) + endif() +endif() + 
+# Handle Python executable variable
+if (NOT DEFINED PYTHON_EXECUTABLE)
+  set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
+endif()
+if(DEFINED Python_INCLUDE_DIRS)
+  list(APPEND INCLUDE_DIRS ${Python_INCLUDE_DIRS})
+elseif(DEFINED PYTHON_INCLUDE_DIRS)
+  list(APPEND INCLUDE_DIRS ${PYTHON_INCLUDE_DIRS})
+endif()
+if (NOT APPLE)
+  # Don't directly link with the python library on mac; instead rely on -undefined dynamic_lookup
+  # to bind to the python symbols at runtime from the interpreter
+  if(DEFINED Python_LIBRARIES)
+    list(APPEND LINK_LIBS ${Python_LIBRARIES})
+  elseif(DEFINED PYTHON_LIBRARIES)
+    list(APPEND LINK_LIBS ${PYTHON_LIBRARIES})
+  endif()
+endif()
+
 if(UNIX)
   list(APPEND LINK_LIBS m)
 endif()
@@ -273,13 +310,22 @@ endif()
 ################################################################################
 ################ Linker Flags ##################################################
+
+# Set linker for Clang after threading detection is complete
+if(NOT WIN32)
+  if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+    set(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=lld")
+  endif()
+endif()
+
 if(BUILD_SHARED_LIBS AND NOT APPLE AND NOT WIN32)  # Windows and Linux Dynamic Linking
   list(APPEND LINKER_FLAGS "${OpenMP_CXX_FLAGS} -Wl,--no-undefined -Wl,--start-group ${MKL_LIBRARIES_LIST} -Wl, --end-group")
 elseif(UNIX AND NOT APPLE)  # Linux Static Linking
   list(APPEND LINKER_FLAGS "${OpenMP_CXX_FLAGS} -Wl,--no-undefined -Wl,--start-group ${MKL_LIBRARIES_LIST} -Wl,--end-group")
 elseif(APPLE)  # Apple Dynamic and Static Linking
-
-  list(APPEND LINKER_FLAGS "${OpenMP_CXX_FLAGS} -Xlinker -undefined -Xlinker dynamic_lookup")
+  list(APPEND LINKER_FLAGS ${OpenMP_CXX_FLAGS})
+  list(APPEND LINKER_FLAGS "-Wl,-bind_at_load")
+  list(APPEND LINKER_FLAGS "-Wl,-undefined,dynamic_lookup")
 else()  # Windows Static Linking
   list(APPEND LINKER_FLAGS "${OpenMP_CXX_FLAGS}")
 endif()
@@ -328,4 +374,3 @@ add_subdirectory(pypiwheel)
 
 # Formatting
 include(cmake/clang-format.cmake)
-
diff --git a/cmake/FindAccelerateSparse.cmake b/cmake/FindAccelerateSparse.cmake
new file mode 100644
index 00000000..f15c60b2
--- /dev/null
+++ b/cmake/FindAccelerateSparse.cmake
@@ -0,0 +1,143 @@
+# Ceres Solver - A fast non-linear least squares minimizer
+# Copyright 2023 Google Inc. All rights reserved.
+# http://ceres-solver.org/
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of Google Inc. nor the names of its contributors may be
+#   used to endorse or promote products derived from this software without
+#   specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: alexs.mac@gmail.com (Alex Stewart)
+#
+# FindAccelerateSparse.cmake - Find the sparse solvers in Apple's Accelerate
+#                              framework, introduced in Xcode 9.0 (2017).
+#                              Note that this is distinct from the Accelerate
+#                              framework on its own, which existed in previous
+#                              versions but without the sparse solvers.
+#
+# This module defines the following variables which should be referenced
+# by the caller to use the library.
+#
+# AccelerateSparse_FOUND: TRUE iff an Accelerate framework including the sparse
+#                         solvers, and all dependencies, has been found.
+# AccelerateSparse_INCLUDE_DIRS: Include directories for Accelerate framework.
+# AccelerateSparse_LIBRARIES: Libraries for Accelerate framework and all
+#                             dependencies.
+#
+# The following variables are also defined by this module, but in line with
+# CMake recommended FindPackage() module style should NOT be referenced directly
+# by callers (use the plural variables detailed above instead). These variables
+# do however affect the behaviour of the module via FIND_[PATH/LIBRARY]() which
+# are NOT re-called (i.e. search for library is not repeated) if these variables
+# are set with valid values _in the CMake cache_. This means that if these
+# variables are set directly in the cache, either by the user in the CMake GUI,
+# or by the user passing -DVAR=VALUE directives to CMake when called (which
+# explicitly defines a cache variable), then they will be used verbatim,
+# bypassing the HINTS variables and other hard-coded search locations.
+#
+# AccelerateSparse_INCLUDE_DIR: Include directory for Accelerate framework, not
+#                               including the include directory of any
+#                               dependencies.
+# AccelerateSparse_LIBRARY: Accelerate framework, not including the libraries of
+#                           any dependencies.
+
+# Called if we failed to find the Accelerate framework with the sparse solvers.
+# Unsets all public (designed to be used externally) variables and reports
+# error message at priority depending upon [REQUIRED/QUIET/<NONE>] argument.
+macro(accelerate_sparse_report_not_found REASON_MSG)
+  unset(AccelerateSparse_FOUND)
+  unset(AccelerateSparse_INCLUDE_DIRS)
+  unset(AccelerateSparse_LIBRARIES)
+
+  # Make results of search visible in the CMake GUI if Accelerate has not
+  # been found so that user does not have to toggle to advanced view.
+  mark_as_advanced(CLEAR AccelerateSparse_INCLUDE_DIR
+                         AccelerateSparse_LIBRARY)
+
+  # Note <package>_FIND_[REQUIRED/QUIETLY] variables defined by FindPackage()
+  # use the camelcase library name, not uppercase.
+  if (AccelerateSparse_FIND_QUIETLY)
+    message(STATUS "Failed to find Accelerate framework with sparse solvers - "
+      ${REASON_MSG} ${ARGN})
+  elseif (AccelerateSparse_FIND_REQUIRED)
+    message(FATAL_ERROR "Failed to find Accelerate framework with sparse solvers - "
+      ${REASON_MSG} ${ARGN})
+  else()
+    # Neither QUIETLY nor REQUIRED, use no priority which emits a message
+    # but continues configuration and allows generation.
+ message("-- Failed to find Accelerate framework with sparse solvers - " + ${REASON_MSG} ${ARGN}) + endif() + return() +endmacro() +unset(AccelerateSparse_FOUND) +find_path(AccelerateSparse_INCLUDE_DIR NAMES Accelerate.h) +if (NOT AccelerateSparse_INCLUDE_DIR OR + NOT EXISTS ${AccelerateSparse_INCLUDE_DIR}) + accelerate_sparse_report_not_found( + "Could not find Accelerate framework headers. Set " + "AccelerateSparse_INCLUDE_DIR to the directory containing Accelerate.h") +endif() +find_library(AccelerateSparse_LIBRARY NAMES Accelerate) +if (NOT AccelerateSparse_LIBRARY OR + NOT EXISTS ${AccelerateSparse_LIBRARY}) + accelerate_sparse_report_not_found( + "Could not find Accelerate framework. Set AccelerateSparse_LIBRARY " + "to the Accelerate.framework directory") +endif() +set(AccelerateSparse_FOUND TRUE) +# Determine if the Accelerate framework detected includes the sparse solvers. +# Skip the test for LLVM Clang due to compatibility issues with Apple SDK headers +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND NOT "${CMAKE_CXX_COMPILER_VERSION}" MATCHES "Apple") + # For LLVM Clang, assume the sparse solvers are available on macOS + set(ACCELERATE_FRAMEWORK_HAS_SPARSE_SOLVER TRUE) + message(STATUS "Skipping Accelerate framework sparse solver test for LLVM Clang") +else() + # For other compilers, perform the test + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_INCLUDES ${AccelerateSparse_INCLUDE_DIR}) + set(CMAKE_REQUIRED_LIBRARIES ${AccelerateSparse_LIBRARY}) + check_cxx_source_compiles( + "#include + int main() { + SparseMatrix_Double A; + SparseFactor(SparseFactorizationCholesky, A); + return 0; + }" + ACCELERATE_FRAMEWORK_HAS_SPARSE_SOLVER) + unset(CMAKE_REQUIRED_INCLUDES) + unset(CMAKE_REQUIRED_LIBRARIES) + if (NOT ACCELERATE_FRAMEWORK_HAS_SPARSE_SOLVER) + accelerate_sparse_report_not_found( + "Detected Accelerate framework: ${AccelerateSparse_LIBRARY} does not " + "include the sparse solvers.") + endif() +endif() +if (AccelerateSparse_FOUND) + set(AccelerateSparse_INCLUDE_DIRS ${AccelerateSparse_INCLUDE_DIR}) + set(AccelerateSparse_LIBRARIES ${AccelerateSparse_LIBRARY}) +endif() +# Handle REQUIRED / QUIET optional arguments and version. 
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(AccelerateSparse
+  REQUIRED_VARS AccelerateSparse_INCLUDE_DIRS AccelerateSparse_LIBRARIES)
+
+if (AccelerateSparse_FOUND)
+  mark_as_advanced(FORCE AccelerateSparse_INCLUDE_DIR
+                         AccelerateSparse_LIBRARY)
+endif()
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ce502b62..8f1ecdf1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -155,11 +155,9 @@ target_link_libraries(asset PRIVATE ${LINK_LIBS})
 
 #set_target_properties(asset PROPERTIES CXX_VISIBILITY_PRESET hidden)
 
 if(UNIX)
-  set_target_properties(asset PROPERTIES LINK_FLAGS ${LINKER_FLAGS})
+  target_link_options(asset PRIVATE ${LINKER_FLAGS})
 endif()
 
-
-
 add_custom_target(pyassetsrc
   ALL
   DEPENDS ${CMAKE_SOURCE_DIR}/asset_asrl)
@@ -184,13 +182,3 @@ if(PYTHON_LOCAL_INSTALL_DIR)
     COMMENT "Copied asset_asrl to ${PYTHON_LOCAL_INSTALL_DIR}")
 endif()
-
-
-
-
-
-
-
-
-
-
diff --git a/src/Solvers/ASSET_Solvers.h b/src/Solvers/ASSET_Solvers.h
index 94d25cbc..6247eb08 100644
--- a/src/Solvers/ASSET_Solvers.h
+++ b/src/Solvers/ASSET_Solvers.h
@@ -7,7 +7,11 @@
 #include "OptimizationProblem.h"
 #include "OptimizationProblemBase.h"
 #include "PSIOPT.h"
+#ifdef USE_ACCELERATE_SPARSE
+#include "AccelerateInterface.h"
+#else
 #include "mkl.h"
+#endif
 
 namespace ASSET {
 
@@ -15,7 +19,9 @@
   // auto sol = m.def_submodule("Solvers","SubModule Containing PSIOPT,NLP, and Solver Flags");
   auto& sol = reg.getSolversModule();
 
+#ifndef USE_ACCELERATE_SPARSE
   int DSECOND = dsecnd();
+#endif
 
   PSIOPT::Build(sol);
   OptimizationProblemBase::Build(sol);
   Jet::Build(sol);
diff --git a/src/Solvers/AccelerateInterface.h b/src/Solvers/AccelerateInterface.h
new file mode 100644
index 00000000..f6e1189d
--- /dev/null
+++ b/src/Solvers/AccelerateInterface.h
@@ -0,0 +1,841 @@
+#ifndef EIGEN_ACCELERATESUPPORT_H
+#define EIGEN_ACCELERATESUPPORT_H
+
+#include "AccelerateUtils.h"
+
+#include <memory>
+#include <vector>
+
+#include <Accelerate/Accelerate.h>
+
+#include <Eigen/Sparse>
+#include <fmt/color.h>
+
+/*
+The classes in this file are directly based on the AccelerateSupport module from Eigen 3.4 and
+are subject to Eigen's MPL2 license, which can be found in the notices folder of the GitHub repository.
+Changes include the addition of several member variables and functions to provide more fine-grained
+control of the Accelerate Sparse solvers, to avoid making unnecessary copies/allocations, to add an
+iterative refinement implementation (to bring solution accuracy more in line with PARDISO), as well as
+to more closely align with the methods of the PARDISO interface in PardisoInterface.h (particularly
+for methods that ASSET employs extensively). Note that the MPL2 license is only applied to this particular
+file and not the rest of the project, as per the MPL2 license.
+*/
+
+namespace Eigen {
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+class AccelerateImpl;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLLT
+ * \brief A direct Cholesky (LLT) factorization and solver based on Accelerate
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLLT
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationCholesky, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLT
+ * \brief The default Cholesky (LDLT) factorization and solver based on Accelerate
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLT
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLT, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLTUnpivoted
+ * \brief A direct Cholesky-like LDL^T factorization and solver based on Accelerate with only 1x1 pivots and no pivoting
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTUnpivoted
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTUnpivoted = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTUnpivoted, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLTSBK
+ * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with Supernode Bunch-Kaufman and static
+ * pivoting
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTSBK
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTSBK = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTSBK, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLTTPP
+ * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with full threshold partial pivoting
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTTPP
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTTPP = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTTPP, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateQR
+ * \brief A QR factorization and solver based on Accelerate
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateQR
+ */
+template <typename MatrixType>
+using AccelerateQR = AccelerateImpl<MatrixType, 0, SparseFactorizationQR, false>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateCholeskyAtA
+ * \brief A QR factorization and solver based on Accelerate without storing Q (equivalent to A^TA = R^T R)
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateCholeskyAtA
+ */
+template <typename MatrixType>
+using AccelerateCholeskyAtA = AccelerateImpl<MatrixType, 0, SparseFactorizationCholeskyAtA, false>;
+
+namespace internal {
+template <typename T>
+struct AccelFactorizationDeleter {
+  void operator()(T* sym) {
+    if (sym) {
+      SparseCleanup(*sym);
+      delete sym;
+      sym = nullptr;
+    }
+  }
+};
+
+template <typename DenseVecT, typename DenseMatT, typename SparseMatT, typename NumFactT>
+struct SparseTypesTraitBase {
+  typedef DenseVecT AccelDenseVector;
+  typedef DenseMatT AccelDenseMatrix;
+  typedef SparseMatT AccelSparseMatrix;
+
+  typedef SparseOpaqueSymbolicFactorization SymbolicFactorization;
+  typedef NumFactT NumericFactorization;
+
+  typedef AccelFactorizationDeleter<SymbolicFactorization> SymbolicFactorizationDeleter;
+  typedef AccelFactorizationDeleter<NumericFactorization> NumericFactorizationDeleter;
+};
+
+template <typename Scalar>
+struct SparseTypesTrait {};
+
+template <>
+struct SparseTypesTrait<double> : SparseTypesTraitBase<DenseVector_Double, DenseMatrix_Double,
+                                                       SparseMatrix_Double, SparseOpaqueFactorization_Double> {};
+
+template <>
+struct SparseTypesTrait<float> : SparseTypesTraitBase<DenseVector_Float, DenseMatrix_Float,
+                                                      SparseMatrix_Float, SparseOpaqueFactorization_Float> {
+};
+
+// Taken from https://github.com/ceres-solver/ceres-solver/blob/master/internal/ceres/accelerate_sparse.cc
+inline void* resizeForAccelerateAlignment(const size_t required_size,
+                                          std::vector<uint8_t>* mem_vec)
+{
+  // As per the Accelerate documentation, all workspace memory passed to the
+  // sparse solver functions must be 16-byte aligned.
+  constexpr int kAccelerateRequiredAlignment = 16;
+  // Although malloc() on macOS should always be 16-byte aligned, it is unclear
+  // if this holds for new(), or on other Apple OSs (phoneOS, watchOS etc).
+  // As such we assume it is not and use std::align() to create a (potentially
+  // offset) 16-byte aligned sub-buffer of the specified size within workspace.
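+  // (The extra kAccelerateRequiredAlignment bytes reserved by the resize below
+  // guarantee that std::align() can always carve out the requested sub-buffer.)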
+  mem_vec->resize(required_size + kAccelerateRequiredAlignment);
+  size_t size_from_aligned_start = mem_vec->size();
+  void* aligned_solve_workspace_start =
+      reinterpret_cast<void*>(mem_vec->data());
+  aligned_solve_workspace_start = std::align(kAccelerateRequiredAlignment,
+                                             required_size,
+                                             aligned_solve_workspace_start,
+                                             size_from_aligned_start);
+  return aligned_solve_workspace_start;
+}
+
+}  // end namespace internal
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+class AccelerateImpl : public SparseSolverBase<AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_> > {
+ protected:
+  using Base = SparseSolverBase<AccelerateImpl>;
+  using Base::derived;
+  using Base::m_isInitialized;
+
+ public:
+  using Base::_solve_impl;
+
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic };
+  enum { UpLo = UpLo_ };
+
+  using AccelDenseVector = typename internal::SparseTypesTrait<Scalar>::AccelDenseVector;
+  using AccelDenseMatrix = typename internal::SparseTypesTrait<Scalar>::AccelDenseMatrix;
+  using AccelSparseMatrix = typename internal::SparseTypesTrait<Scalar>::AccelSparseMatrix;
+  using SymbolicFactorization = typename internal::SparseTypesTrait<Scalar>::SymbolicFactorization;
+  using NumericFactorization = typename internal::SparseTypesTrait<Scalar>::NumericFactorization;
+  using SymbolicFactorizationDeleter = typename internal::SparseTypesTrait<Scalar>::SymbolicFactorizationDeleter;
+  using NumericFactorizationDeleter = typename internal::SparseTypesTrait<Scalar>::NumericFactorizationDeleter;
+
+  AccelerateImpl() {
+    m_isInitialized = false;
+
+    auto check_flag_set = [](int value, int flag) { return ((value & flag) == flag); };
+
+    if (check_flag_set(UpLo_, Symmetric)) {
+      m_sparseKind = SparseSymmetric;
+      m_triType = (UpLo_ & Lower) ? SparseLowerTriangle : SparseUpperTriangle;
+    } else if (check_flag_set(UpLo_, UnitLower)) {
+      m_sparseKind = SparseUnitTriangular;
+      m_triType = SparseLowerTriangle;
+    } else if (check_flag_set(UpLo_, UnitUpper)) {
+      m_sparseKind = SparseUnitTriangular;
+      m_triType = SparseUpperTriangle;
+    } else if (check_flag_set(UpLo_, StrictlyLower)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseLowerTriangle;
+    } else if (check_flag_set(UpLo_, StrictlyUpper)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseUpperTriangle;
+    } else if (check_flag_set(UpLo_, Lower)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseLowerTriangle;
+    } else if (check_flag_set(UpLo_, Upper)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseUpperTriangle;
+    } else {
+      m_sparseKind = SparseOrdinary;
+      m_triType = (UpLo_ & Lower) ? SparseLowerTriangle : SparseUpperTriangle;
+    }
+
+    m_order = SparseOrderMetis;  // Use METIS ordering by default for better performance
+    m_doIterativeRefinement = false;
+    m_iterativeRefinementIterations = 2;
+  }
+
+  explicit AccelerateImpl(const MatrixType& matrix) : AccelerateImpl() { compute(matrix); }
+
+  ~AccelerateImpl() {}
+
+  inline Index cols() const { return m_nCols; }
+  inline Index rows() const { return m_nRows; }
+
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+  void setMatrix(const MatrixType& matrix);
+
+  void analyzePattern(const MatrixType& matrix);
+
+  void factorize(const MatrixType& matrix);
+
+  void compute(const MatrixType& matrix);
+
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, MatrixBase<Dest>& dest) const;
+
+  /** Sets the ordering algorithm to use. */
+  void setOrder(SparseOrder_t order) { m_order = order; }
+
+  /** Sets the number of threads for Accelerate */
+  inline void setNumThreads(int num_threads) {
+    accelerate_set_num_threads(num_threads);
+  }
+
+  void setIterativeRefinement(bool iterativeRefinement) {
+    m_doIterativeRefinement = iterativeRefinement;
+  }
+
+  void setIterativeRefinementIterations(int iterations) {
+    eigen_assert(iterations >= 0 && "Number of iterations must be non-negative.");
+    m_iterativeRefinementIterations = iterations;
+  }
+
+  MatrixType& getMatrix() { return m_matrix; }
+
+  template <int U = UpLo>
+  typename std::enable_if<(U & Symmetric) == 0, void>::type
+  getMatrix(const MatrixType& matrix) {
+    m_matrix = matrix;
+    m_matrix.makeCompressed();
+  }
+
+  template <int U = UpLo>
+  typename std::enable_if<(U & Symmetric) != 0, void>::type
+  getMatrix(const MatrixType& matrix) {
+    // Similar to Pardiso, use selfadjointView to ensure symmetry
+    PermutationMatrix<Dynamic, Dynamic, StorageIndex> p_null;
+    m_matrix.resize(matrix.rows(), matrix.cols());
+
+    constexpr int TriangleType = (UpLo & Lower) ? Lower : Upper;
+    m_matrix.template selfadjointView<TriangleType>() =
+        matrix.template selfadjointView<TriangleType>().twistedBy(p_null);
+    m_matrix.makeCompressed();
+  }
+
+  template <int U = UpLo>
+  typename std::enable_if<(U & Symmetric) != 0, MatrixType>::type
+  getMatrixTwisted(const MatrixType& matrix) {
+    eigen_assert(!m_permutation.empty() && "Permutation not available. Call compute() or analyzePattern() first.");
+    eigen_assert(matrix.rows() == matrix.cols() && "Matrix must be square for twisted operation.");
+    eigen_assert(static_cast<size_t>(matrix.rows()) == m_permutation.size() && "Matrix size must match permutation size.");
+
+    // Create permutation matrix from the stored permutation vector
+    PermutationMatrix<Dynamic, Dynamic, int> p_perm;
+    p_perm.indices() = Map<Matrix<int, Dynamic, 1>>(m_permutation.data(), m_permutation.size());
+
+    MatrixType result;
+    result.resize(matrix.rows(), matrix.cols());
+
+    constexpr int TriangleType = (U & Lower) ? Lower : Upper;
+    result.template selfadjointView<TriangleType>() =
+        matrix.template selfadjointView<TriangleType>().twistedBy(p_perm);
+
+    result.makeCompressed();
+    return result;
+  }
+
+  template <int U = UpLo>
+  typename std::enable_if<(U & Symmetric) != 0, int>::type
+  peigs() const {
+    eigen_assert(m_numericFactorization && "Numerical factorization must be computed first.");
+
+    int num_positive = 0, num_zero = 0, num_negative = 0;
+    int status = SparseGetInertia(*m_numericFactorization, &num_positive, &num_zero, &num_negative);
+
+    if (status != 0) {
+      // If SparseGetInertia fails, return -1 to indicate error
+      return -1;
+    }
+
+    return static_cast<int>(num_positive);
+  }
+
+  template <int U = UpLo>
+  typename std::enable_if<(U & Symmetric) != 0, int>::type
+  neigs() const {
+    eigen_assert(m_numericFactorization && "Numerical factorization must be computed first.");
+
+    int num_positive = 0, num_zero = 0, num_negative = 0;
+    int status = SparseGetInertia(*m_numericFactorization, &num_positive, &num_zero, &num_negative);
+
+    if (status != 0) {
+      // If SparseGetInertia fails, return -1 to indicate error
+      return -1;
+    }
+
+    return static_cast<int>(num_negative);
+  }
+
+  template <int U = UpLo>
+  typename std::enable_if<(U & Symmetric) != 0, int>::type
+  zeigs() const {
+    eigen_assert(m_numericFactorization && "Numerical factorization must be computed first.");
+
+    int num_positive = 0, num_zero = 0, num_negative = 0;
+    int status = SparseGetInertia(*m_numericFactorization, &num_positive, &num_zero, &num_negative);
+
+    if (status != 0) {
+      // If SparseGetInertia fails, return -1 to indicate error
+      return -1;
+    }
+
+    return static_cast<int>(num_zero);
+  }
+
+  inline int ppivs() const {
+    // Accelerate doesn't provide direct pivot perturbation count
+    // Return 0 as a safe default for now
+    return 0;
+  }
+
+  // This method initializes the internal AccelSparseMatrix. This must be called
+  // after changing the sparsity pattern of m_matrix via the reference returned
+  // from MatrixType& getMatrix()
+  void reinitializeInternalMatrixRepresentation();
+
+  // Internal factorization methods
+  AccelerateImpl& analyzePattern_internal();
+  AccelerateImpl& factorize_internal();
+  AccelerateImpl& compute_internal();
+
+  // Cleanup method
+  void release();
+
+  // Performance metrics
+  mutable int m_flops = 0;
+  mutable int m_mem = 0;
+
+ private:
+  void buildAccelSparseMatrix() {
+    SparseAttributes_t attributes{};
+    attributes.kind = m_sparseKind;
+
+    SparseMatrixStructure structure{};
+    structure.blockSize = 1;
+
+    if (!(MatrixType::Flags & Eigen::RowMajorBit)) {  // CSC format
+      const Index nColumnsStarts = m_matrix.cols() + 1;
+      m_columnStarts.resize(nColumnsStarts);
+      std::copy_n(m_matrix.outerIndexPtr(), nColumnsStarts, m_columnStarts.data());
+
+      structure.rowCount = static_cast<int>(m_matrix.rows());
+      structure.columnCount = static_cast<int>(m_matrix.cols());
+      structure.columnStarts = m_columnStarts.data();
+      structure.rowIndices = const_cast<int*>(m_matrix.innerIndexPtr());
+      attributes.transpose = false;
+      attributes.triangle = m_triType;
+    } else {  // RowMajor (CSR) format
+      // For CSR, Accelerate expects CSC. We use the 'transpose' attribute
+      // to tell Accelerate to interpret the CSR matrix as a transposed CSC matrix.
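+      // (Only the outer-index array is copied below so it can be widened to the
+      // integer type Accelerate expects; the inner indices and values are passed
+      // through unchanged, with row/column roles swapped in the structure.)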
+      const Index nRowStarts = m_matrix.rows() + 1;
+      m_columnStarts.resize(nRowStarts);  // Reuse m_columnStarts for rowStarts
+      std::copy_n(m_matrix.outerIndexPtr(), nRowStarts, m_columnStarts.data());
+
+      structure.rowCount = static_cast<int>(m_matrix.cols());  // Swapped
+      structure.columnCount = static_cast<int>(m_matrix.rows());  // Swapped
+      structure.columnStarts = m_columnStarts.data();  // These are now rowStarts
+      structure.rowIndices = const_cast<int*>(m_matrix.innerIndexPtr());  // These are now columnIndices
+      attributes.transpose = true;
+
+      // When transposing, we need to flip the triangle type for symmetric matrices
+      if (m_sparseKind == SparseSymmetric) {
+        attributes.triangle = (m_triType == SparseLowerTriangle) ? SparseUpperTriangle : SparseLowerTriangle;
+      } else {
+        attributes.triangle = m_triType;
+      }
+    }
+
+    structure.attributes = attributes;
+    m_accel_matrix.structure = structure;
+    m_accel_matrix.data = const_cast<Scalar*>(m_matrix.valuePtr());
+  }
+
+  void doAnalysis() {
+    m_numericFactorization.reset(nullptr);
+
+    // Only resize permutation if necessary to avoid unnecessary allocations
+    if (m_permutation.size() != static_cast<size_t>(m_nRows)) {
+      m_permutation.resize(m_nRows);
+    }
+    std::iota(m_permutation.begin(), m_permutation.end(), 0);  // Initialize with identity
+
+    SparseSymbolicFactorOptions fopts{};
+    fopts.control = SparseDefaultControl;
+    fopts.orderMethod = m_order;
+    fopts.order = m_permutation.data();  // Provide storage for computed permutation
+    fopts.ignoreRowsAndColumns = nullptr;
+    fopts.malloc = malloc;
+    fopts.free = free;
+
+    fopts.reportError = [](const char* msg) {
+      fmt::print(fmt::fg(fmt::color::red), "Accelerate Sparse Symbolic Factorization Error: {}\n", msg);
+    };
+
+    m_symbolicFactorization.reset(new SymbolicFactorization(SparseFactor(Solver_, m_accel_matrix.structure, fopts)));
+
+    SparseStatus_t status = m_symbolicFactorization->status;
+
+    updateInfoStatus(status);
+
+    if (status != SparseStatusOK) {
+      m_symbolicFactorization.reset(nullptr);
+    }
+  }
+
+  void doFactorization() {
+    SparseStatus_t status = SparseStatusReleased;
+
+    if (m_symbolicFactorization) {
+
+      SparseNumericFactorOptions nopts{};
+      nopts.control = SparseDefaultControl;
+      nopts.scalingMethod = SparseScalingDefault;
+      nopts.scaling = nullptr;
+      // Default values set by Apple
+      nopts.pivotTolerance = 0.01;  // Recommended value for difficult matrices in double
+      nopts.zeroTolerance = 1e-4 * __DBL_EPSILON__;  // "A few" orders of magnitude below epsilon.
+
+      // Get factor and workspace size
+      const int factorSize =
+          std::is_same<Scalar, double>::value
+              ? m_symbolicFactorization->factorSize_Double
+              : m_symbolicFactorization->factorSize_Float;
+      const int workspaceSize =
+          std::is_same<Scalar, double>::value
+              ? m_symbolicFactorization->workspaceSize_Double
+              : m_symbolicFactorization->workspaceSize_Float;
+
+      m_numericFactorization.reset(new NumericFactorization(SparseFactor(
+          *m_symbolicFactorization, m_accel_matrix, nopts,
+          internal::resizeForAccelerateAlignment(factorSize, &m_factorStorage),
+          internal::resizeForAccelerateAlignment(workspaceSize, &m_workspace))));
+
+      status = m_numericFactorization->status;
+
+      if (status != SparseStatusOK) m_numericFactorization.reset(nullptr);
+    }
+
+    updateInfoStatus(status);
+  }
+
+ protected:
+  void updateInfoStatus(SparseStatus_t status) const {
+    switch (status) {
+      case SparseStatusOK:
+        m_info = Success;
+        break;
+      case SparseFactorizationFailed:
+      case SparseMatrixIsSingular:
+        m_info = NumericalIssue;
+        break;
+      case SparseInternalError:
+      case SparseParameterError:
+      case SparseStatusReleased:
+      default:
+        m_info = InvalidInput;
+        break;
+    }
+  }
+
+  std::vector<long> m_columnStarts;
+  mutable MatrixType m_matrix;
+  mutable AccelSparseMatrix m_accel_matrix;
+  mutable ComputationInfo m_info;
+  mutable std::vector<uint8_t> m_factorStorage;
+  mutable std::vector<uint8_t> m_workspace;
+  mutable std::vector<uint8_t> m_solve_workspace;  // Cache solve workspace
+  mutable std::vector<Scalar> m_r_mem;
+  mutable int m_cached_solve_workspace_size = 0;  // Track cached size
+  Index m_nRows, m_nCols;
+  std::unique_ptr<SymbolicFactorization, SymbolicFactorizationDeleter> m_symbolicFactorization;
+  std::unique_ptr<NumericFactorization, NumericFactorizationDeleter> m_numericFactorization;
+  SparseKind_t m_sparseKind;
+  SparseTriangle_t m_triType;
+  SparseOrder_t m_order;
+  bool m_doIterativeRefinement;
+  int m_iterativeRefinementIterations;
+  mutable std::vector<int> m_permutation;  // Store permutation from factorization
+};
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::setMatrix(const MatrixType& matrix) {
+  if (EnforceSquare_) {
+    eigen_assert(matrix.rows() == matrix.cols());
+  }
+
+  //m_matrix = matrix;
+  getMatrix(matrix);
+  m_nRows = m_matrix.rows();
+  m_nCols = m_matrix.cols();
+
+  buildAccelSparseMatrix();
+
+  m_isInitialized = false;
+  m_symbolicFactorization.reset(nullptr);
+  m_numericFactorization.reset(nullptr);
+  m_cached_solve_workspace_size = 0;  // Clear cached workspace size
+  m_info = Success;
+}
+
+/** Computes the symbolic and numeric decomposition of matrix \a a */
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::compute(const MatrixType& a) {
+  setMatrix(a);
+
+  doAnalysis();
+
+  if (m_symbolicFactorization) doFactorization();
+
+  m_isInitialized = true;
+}
+
+/** Performs a symbolic decomposition on the sparsity pattern of matrix \a a.
+ *
+ * This function is particularly useful when solving for several problems having the same structure.
+ *
+ * \sa factorize()
+ */
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::analyzePattern(const MatrixType& a) {
+  if (EnforceSquare_) {
+    eigen_assert(a.rows() == a.cols());
+  }
+  setMatrix(a);
+
+  doAnalysis();
+
+  m_isInitialized = true;
+}
+
+/** Performs a numeric decomposition of matrix \a a.
+ *
+ * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been
+ * performed.
+ *
+ * \sa analyzePattern()
+ */
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::factorize(const MatrixType& a) {
+  eigen_assert(m_symbolicFactorization && "You must first call analyzePattern()");
+  eigen_assert(m_nRows == a.rows() && m_nCols == a.cols());
+
+  if (EnforceSquare_) {
+    eigen_assert(a.rows() == a.cols());
+  }
+
+  m_matrix = a;
+  buildAccelSparseMatrix();
+  m_numericFactorization.reset(nullptr);
+
+  doFactorization();
+}
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+template <typename Rhs, typename Dest>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::_solve_impl(const MatrixBase<Rhs>& b,
+                                                                              MatrixBase<Dest>& x) const {
+  if (!m_numericFactorization) {
+    m_info = InvalidInput;
+    return;
+  }
+
+  eigen_assert(m_nRows == b.rows());
+  eigen_assert(((b.cols() == 1) || b.outerStride() == b.rows()));
+
+  SparseStatus_t status = SparseStatusOK;
+
+  Scalar* b_ptr = const_cast<Scalar*>(b.derived().data());
+  Scalar* x_ptr = const_cast<Scalar*>(x.derived().data());
+
+  AccelDenseMatrix xmat{};
+  xmat.attributes = SparseAttributes_t();
+  xmat.columnCount = static_cast<int>(x.cols());
+  xmat.rowCount = static_cast<int>(x.rows());
+  xmat.columnStride = xmat.rowCount;
+  xmat.data = x_ptr;
+
+  AccelDenseMatrix bmat{};
+  bmat.attributes = SparseAttributes_t();
+  bmat.columnCount = static_cast<int>(b.cols());
+  bmat.rowCount = static_cast<int>(b.rows());
+  bmat.columnStride = bmat.rowCount;
+  bmat.data = b_ptr;
+
+  const int nrhs = (bmat.attributes.transpose) ? bmat.rowCount : bmat.columnCount;
+  const int workspaceSize = m_numericFactorization->solveWorkspaceRequiredStatic
+                            + nrhs * m_numericFactorization->solveWorkspaceRequiredPerRHS;
+
+  // Use cached solve workspace to avoid repeated allocations
+  void* ws;
+  if (workspaceSize != m_cached_solve_workspace_size) {
+    ws = internal::resizeForAccelerateAlignment(workspaceSize, &m_solve_workspace);
+    m_cached_solve_workspace_size = workspaceSize;
+  } else {
+    // Reuse existing aligned workspace
+    constexpr int kAccelerateRequiredAlignment = 16;
+    ws = reinterpret_cast<void*>(
+        m_solve_workspace.data()
+        + (kAccelerateRequiredAlignment
+           - reinterpret_cast<uintptr_t>(m_solve_workspace.data()) % kAccelerateRequiredAlignment)
+              % kAccelerateRequiredAlignment);
+  }
+  assert(ws != nullptr && "Accelerate workspace alignment failed");
+
+  SparseSolve(*m_numericFactorization, bmat, xmat, ws);
+
+  updateInfoStatus(status);
+
+  if (m_doIterativeRefinement)
+  {
+    auto n = vDSP_Length(x.rows() * x.cols());
+    if (m_r_mem.size() < n) {
+      m_r_mem.resize(n);
+    }
+    AccelDenseMatrix ref_mat{};
+    ref_mat.attributes = SparseAttributes_t();
+    ref_mat.columnCount = static_cast<int>(x.cols());
+    ref_mat.rowCount = static_cast<int>(x.rows());
+    ref_mat.columnStride = ref_mat.rowCount;
+    ref_mat.data = m_r_mem.data();
+
+    for (int i = 0; i < m_iterativeRefinementIterations; ++i) {
+      // Calculate residual and store in ref_mat
+      vDSP_vnegD(
+          bmat.data, 1,
+          ref_mat.data, 1, n);
+      SparseMultiplyAdd(m_accel_matrix, xmat, ref_mat);
+
+      // Solve for correction and store in ref_mat
+      SparseSolve(*m_numericFactorization, ref_mat, ws);
+
+      // vDSP operation that calculates x -= correction
+      vDSP_vsubD(
+          ref_mat.data, 1,
+          xmat.data, 1,
+          xmat.data, 1,
+          n);
+    }
+  }
+}
+
+/** Initializes the internal AccelSparseMatrix from the internal Eigen sparse matrix
+ *
+ * This method must be called after changing the sparsity pattern of the m_matrix
+ * member via the reference returned from MatrixType& getMatrix().
+ *
+ */
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::reinitializeInternalMatrixRepresentation()
+{
+  // Update matrix dimensions in case they changed when the sparsity pattern was modified
+  m_nRows = m_matrix.rows();
+  m_nCols = m_matrix.cols();
+
+  // Build/rebuild the internal AccelSparseMatrix from the current state of m_matrix
+  buildAccelSparseMatrix();
+
+  // Reset factorizations since the matrix structure has changed
+  m_symbolicFactorization.reset(nullptr);
+  m_numericFactorization.reset(nullptr);
+
+  // Clear cached workspace size since matrix structure changed
+  m_cached_solve_workspace_size = 0;
+
+  // Reset initialization state - will need to recompute factorizations
+  m_isInitialized = false;
+
+  // Reset computation info
+  m_info = Success;
+}
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>&
+AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::analyzePattern_internal() {
+  eigen_assert(m_matrix.rows() > 0 && m_matrix.cols() > 0 && "Matrix must be set before calling analyzePattern_internal()");
+
+  m_symbolicFactorization.reset(nullptr);
+  doAnalysis();
+
+  m_isInitialized = true;
+  return *this;
+}
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>&
+AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::factorize_internal() {
+  eigen_assert(m_symbolicFactorization && "You must first call analyzePattern_internal()");
+
+  m_numericFactorization.reset(nullptr);
+  doFactorization();
+
+  // Update performance metrics after factorization
+  if (m_numericFactorization) {
+    // Estimate memory usage based on factorization storage
+    m_mem = static_cast<int>(m_factorStorage.size() + m_workspace.size());
+
+    // Estimate FLOP count based on matrix size and sparsity
+    // This is a rough estimate since Accelerate doesn't provide exact FLOP counts
+    Index nnz = m_matrix.nonZeros();
+    m_flops = static_cast<int>(nnz * std::log(static_cast<double>(m_nRows)));
+  }
+
+  return *this;
+}
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>&
+AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::compute_internal() {
+  eigen_assert(m_matrix.rows() > 0 && m_matrix.cols() > 0 && "Matrix must be set before calling compute_internal()");
+
+  m_symbolicFactorization.reset(nullptr);
+  m_numericFactorization.reset(nullptr);
+
+  doAnalysis();
+
+  if (m_symbolicFactorization) {
+    doFactorization();
+
+    // Update performance metrics after factorization
+    if (m_numericFactorization) {
+      // Estimate memory usage based on factorization storage
+      m_mem = static_cast<int>(m_factorStorage.size() + m_workspace.size());
+
+      // Estimate FLOP count based on matrix size and sparsity
+      Index nnz = m_matrix.nonZeros();
+      m_flops = static_cast<int>(nnz * std::log(static_cast<double>(m_nRows)));
+    }
+  }
+
+  m_isInitialized = true;
+  return *this;
+}
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::release() {
+  // Clean up factorizations
+  m_numericFactorization.reset(nullptr);
+  m_symbolicFactorization.reset(nullptr);
+
+  // Clear storage vectors
+  m_factorStorage.clear();
+  m_workspace.clear();
+  m_solve_workspace.clear();
+  m_r_mem.clear();
+
+  // Clear matrix data
+  m_matrix.resize(0, 0);
+  m_matrix.data().squeeze();
+
+  // Clear permutation
+  m_permutation.clear();
+
+  // Reset cached sizes
+  m_cached_solve_workspace_size = 0;
+
+  // Reset performance metrics
+  m_flops = 0;
+  m_mem = 0;
+
+  // Reset initialization state
+  m_isInitialized = false;
+  m_info = Success;
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_ACCELERATESUPPORT_H
diff --git a/src/Solvers/AccelerateUtils.h b/src/Solvers/AccelerateUtils.h
new file mode 100644
index 00000000..a94225b6
--- /dev/null
+++ b/src/Solvers/AccelerateUtils.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstdlib>
+#include <string>
+
+// Sets the number of threads for Accelerate via the environment variable VECLIB_MAXIMUM_THREADS.
+// Currently, VECLIB_MAXIMUM_THREADS is the only way to specify the number of threads
+// Accelerate will use. This should be swapped to an Accelerate API call if Apple ever adds one.
+inline void accelerate_set_num_threads(int num_threads) {
+  setenv("VECLIB_MAXIMUM_THREADS", std::to_string(num_threads).c_str(), 1);
+}
diff --git a/src/Solvers/Jet.h b/src/Solvers/Jet.h
index 2320750b..d1f91670 100644
--- a/src/Solvers/Jet.h
+++ b/src/Solvers/Jet.h
@@ -1,6 +1,10 @@
 #pragma once
 
 #include "OptimizationProblemBase.h"
+#ifdef USE_ACCELERATE_SPARSE
+#include "AccelerateUtils.h"
+#else
 #include "mkl.h"
+#endif
 
 namespace ASSET {
 
@@ -97,6 +101,10 @@
                     int nt,
                     bool verbose) {
 
+#ifdef USE_ACCELERATE_SPARSE
+      accelerate_set_num_threads(1);
+#endif
+
       int NumJobs = args.size();
       int NumConv = 0;
 
@@ -110,7 +118,9 @@
       Utils::Timer t;
 
       auto Job = [&](int threadid, int i) {
+#ifndef USE_ACCELERATE_SPARSE
         mkl_set_num_threads_local(1);
+#endif
 
         int gfidx = genfidxes[i];
         if constexpr (std::is_same_v) {
@@ -218,4 +228,4 @@
   };
 
-}  // namespace ASSET
\ No newline at end of file
+}  // namespace ASSET
diff --git a/src/Solvers/PSIOPT.cpp b/src/Solvers/PSIOPT.cpp
index ba44f490..ad7fb742 100644
--- a/src/Solvers/PSIOPT.cpp
+++ b/src/Solvers/PSIOPT.cpp
@@ -1,6 +1,8 @@
 #include "PSIOPT.h"
 
+#ifndef USE_ACCELERATE_SPARSE
 #include <mkl.h>
+#endif
 
 #include "PyDocString/Solvers/PSIOPT_doc.h"
 
@@ -12,10 +14,19 @@
 void ASSET::PSIOPT::setNLP(std::shared_ptr<NonLinearProgram> np) {
 
   this->SlackVars = this->nlp->SlackVars;
   this->KKTdim = this->nlp->KKTdim;
   this->setQPParams();
+#ifdef USE_ACCELERATE_SPARSE
+  accelerate_set_num_threads(QPThreads);
+#else
   mkl_set_num_threads(QPThreads);
+#endif
 
   this->nlp->analyzeSparsity(this->KKTSol.getMatrix());
+#ifdef USE_ACCELERATE_SPARSE
+  // we need to call this to update the internal AccelSparseMatrix since
+  // we changed the sparsity pattern via the reference returned from getMatrix.
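+  // (Otherwise the cached SparseMatrixStructure would still describe the old
+  // pattern and the next symbolic factorization would operate on stale data.)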
+  this->KKTSol.reinitializeInternalMatrixRepresentation();
+#endif
 
   if (storespmat)
     spmat = this->KKTSol.getMatrix();
 
   this->QPanalyzed = false;
diff --git a/src/Solvers/PSIOPT.h b/src/Solvers/PSIOPT.h
index 4bd5a367..1075b8f2 100644
--- a/src/Solvers/PSIOPT.h
+++ b/src/Solvers/PSIOPT.h
@@ -2,7 +2,13 @@
 #pragma once
 #include "IterateInfo.h"
 #include "NonLinearProgram.h"
+
+#ifdef USE_ACCELERATE_SPARSE
+#include "AccelerateInterface.h"
+#else
 #include "PardisoInterface.h"
+#endif
+
 #include "Utils/ColorText.h"
 #include "pch.h"
 
@@ -125,7 +131,11 @@
     int InequalCons = 0;
     int KKTdim = 0;
 
+#ifdef USE_ACCELERATE_SPARSE
+    Eigen::AccelerateLDLTTPP<Eigen::SparseMatrix<double, Eigen::RowMajor>, Eigen::Upper> KKTSol;
+#else
     Eigen::PardisoLDLT<Eigen::SparseMatrix<double, Eigen::RowMajor>, Eigen::Upper> KKTSol;
+#endif
 
     int QPThreads = ASSET_DEFAULT_QP_THREADS;
     QPAlgModes QPAlg = QPAlgModes::Classic;
@@ -475,6 +485,21 @@
     }
 
     void setQPParams() {
+#ifdef USE_ACCELERATE_SPARSE
+      // Accelerate interface uses different configuration methods
+      switch (QPOrd) {
+        case MINDEG:
+          this->KKTSol.setOrder(SparseOrderAMD);
+          break;
+        case METIS:
+        case PARMETIS:
+          // Note: the next version of Apple Accelerate will provide a parallel METIS factorization
+          this->KKTSol.setOrder(SparseOrderMetis);
+      }
+      this->KKTSol.setNumThreads(QPThreads);
+      this->KKTSol.setIterativeRefinement(QPRefSteps > 0);
+      this->KKTSol.setIterativeRefinementIterations(QPRefSteps);
+#else
       this->KKTSol.m_ord = QPOrd;
       this->KKTSol.m_pivotstrat = QPPivotStrategy;
       this->KKTSol.m_pivotpert = QPPivotPerturb;
@@ -487,8 +512,8 @@
       if (this->CNRMode)
        this->KKTSol.m_threads = this->QPThreads;
      this->KKTSol.m_parsolve = this->QPParSolve;
 
-      // mkl_set_num_threads(QPThreads);
       this->KKTSol.setParams();
+#endif
     }
 
     void set_early_callback(const EarlyCallBackType& f) {
diff --git a/src/TypeDefs/EigenTypes.h b/src/TypeDefs/EigenTypes.h
index 7f91a6c9..6a5a6668 100644
--- a/src/TypeDefs/EigenTypes.h
+++ b/src/TypeDefs/EigenTypes.h
@@ -72,7 +72,11 @@ using DomainMatrix = Eigen::Matrix;
 
 template <class Scalar, int Size>
 using SuperScalarType = Eigen::Array<Scalar, Size, 1>;
 
+#ifdef __ARM_NEON
+using DefaultSuperScalar = SuperScalarType<double, 2>;
+#else
 using DefaultSuperScalar = SuperScalarType<double, 4>;
+#endif
 
 }  // namespace ASSET
diff --git a/src/main.cpp b/src/main.cpp
index 470ba11a..ceb75fe7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -34,12 +34,17 @@ void SoftwareInfo() {
       std::string(ASSET_COMPILERSTRING) + std::string(" ") + std::string(ASSET_COMPILERVERSION);
   std::string pythonv = std::to_string(PY_MAJOR_VERSION) + "." + std::to_string(PY_MINOR_VERSION);
   std::string vecversion;
-  if (vsize == 2)
+  if (vsize == 2) {
+#ifdef __ARM_NEON
+    vecversion = "ARM NEON - 128 bit - 2 doubles";
+#else
     vecversion = "SSE - 128 bit - 2 doubles";
-  else if (vsize == 4)
+#endif
+  } else if (vsize == 4) {
     vecversion = "AVX2 - 256 bit - 4 doubles";
-  else if (vsize == 8)
+  } else if (vsize == 8) {
     vecversion = "AVX512 - 512 bit - 8 doubles";
+  }
 
   std::string ASSET_STR(" ___ _____ _____ ______ ______ \n"
@@ -91,7 +96,11 @@ void SoftwareInfo() {
   fmt::print(fmt::fg(fmt::color::white), vecversion);
   fmt::print("\n");
   fmt::print(fmt::fg(fmt::color::royal_blue), "  Linear Solver : ");
+  #ifndef USE_ACCELERATE_SPARSE
   fmt::print(fmt::fg(fmt::color::white), "Intel MKL Pardiso");
+  #else
+  fmt::print(fmt::fg(fmt::color::white), "Apple Accelerate Sparse");
+  #endif
   fmt::print("\n");
   fmt::print(fmt::fg(fmt::color::royal_blue), "  Compiled With : ");
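
For reference, a minimal usage sketch of the Accelerate-backed solver path added above. This is illustrative only: the matrix assembly, dimensions, thread count, and the RowMajor/Upper storage assumptions here are hypothetical, but the member functions shown (setNumThreads, setIterativeRefinement, setIterativeRefinementIterations, analyzePattern, factorize, info, solve) are exactly the ones introduced in AccelerateInterface.h.

    #include <Eigen/Sparse>
    #include "AccelerateInterface.h"

    int main() {
      using SpMat = Eigen::SparseMatrix<double, Eigen::RowMajor>;
      SpMat A;            // assume A is assembled elsewhere, upper triangle stored
      Eigen::VectorXd b;  // assume b is sized to match A

      Eigen::AccelerateLDLTTPP<SpMat, Eigen::Upper> solver;
      solver.setNumThreads(4);                     // routes through VECLIB_MAXIMUM_THREADS
      solver.setIterativeRefinement(true);         // brings accuracy closer to PARDISO
      solver.setIterativeRefinementIterations(2);

      solver.analyzePattern(A);  // symbolic factorization; reusable for same-pattern solves
      solver.factorize(A);       // numeric factorization
      if (solver.info() == Eigen::Success) {
        Eigen::VectorXd x = solver.solve(b);
      }
      return 0;
    }

The split between analyzePattern() and factorize() mirrors the PARDISO interface this replaces, which is what lets PSIOPT reuse one symbolic analysis across the many same-structure KKT factorizations of an optimization run.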