From b1e484fb294b2759d3b6b1f68ca0bf5e255b87d1 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 25 Feb 2025 16:39:24 -0500 Subject: [PATCH 1/5] :reformatting --- .clang-format | 8 + .gersemirc | 3 + .git-blame-ignore-revs | 2 +- .gitattributes | 2 +- .pre-commit-config.yaml | 10 +- .readthedocs.yaml | 2 +- CMakeLists.txt | 440 ++++++++---------- cmake/setupCPM.cmake | 20 +- cmake/setupDUCC.cmake | 87 ++-- cmake/setupFFTW.cmake | 164 +++---- cmake/setupSphinx.cmake | 28 +- cmake/setupXSIMD.cmake | 56 +-- cmake/utils.cmake | 144 +++--- contributing.md | 2 +- devel/CMakeLists.txt | 23 +- devel/GuruInterfaceBranch_BrainDump | 38 +- devel/agenda_11-27-23.txt | 5 +- devel/check_dumbinputs.cmake | 66 +-- .../compare_foldrescale_PR440_laptop5700U.txt | 7 +- .../cuda/cufinufft_tasks_meeting_Jun2023.txt | 5 - devel/eval_ker_expts.cpp | 2 +- devel/finufft_meeting-7-5-23.txt | 1 - devel/foldrescale_spreadtest1d_slower.txt | 1 - devel/gen_all_horner_cpp_header.m | 8 +- devel/ker_ppval_coeff_mat.m | 3 +- devel/padding.cpp | 2 +- devel/plans_fall23.txt | 1 - devel/reverse_engineer_tol.m | 4 +- devel/test_ker_ppval.cpp | 6 +- devel/v2spec.md | 4 +- docs/Makefile | 2 +- docs/c.rst | 7 +- docs/c1d.doc | 84 ++-- docs/c1d.docsrc | 4 +- docs/c2d.doc | 78 ++-- docs/c3d.doc | 98 ++-- docs/c3d.docsrc | 2 +- docs/cguru.doc | 68 +-- docs/cguru.docsrc | 4 +- docs/install_gpu.rst | 4 +- docs/latexindex.rst | 10 +- docs/makefile.doc | 4 +- docs/math.rst | 12 +- docs/matlab.rst | 6 +- docs/nfft_migr.rst | 2 +- docs/refs.rst | 1 - docs/tut.rst | 5 +- docs/tutorial/contft.rst | 15 +- docs/tutorial/grf.rst | 4 +- docs/tutorial/peripois2d.rst | 20 +- docs/tutorial/serieseval.rst | 19 +- examples/CMakeLists.txt | 47 +- examples/cuda/CMakeLists.txt | 12 +- fortran/CMakeLists.txt | 31 +- fortran/cmcl_license.txt | 8 +- fortran/directft/README | 2 +- fortran/directft/dirft1d.f | 14 +- fortran/directft/dirft1df.f | 12 +- fortran/directft/dirft2d.f | 8 +- fortran/directft/dirft2df.f | 8 +- fortran/directft/dirft3d.f | 13 +- fortran/directft/dirft3df.f | 13 +- fortran/directft/prini.f | 2 +- fortran/examples/guru1d1.f | 12 +- fortran/examples/guru1d1f.f | 8 +- fortran/examples/nufft1d_demo.f | 2 +- fortran/examples/nufft2d_demo.f | 4 +- fortran/examples/nufft2d_demof.f | 4 +- fortran/examples/nufft2dmany_demo.f | 8 +- fortran/examples/nufft2dmany_demof.f | 8 +- fortran/examples/nufft3d_demo.f | 4 +- fortran/examples/nufft3d_demof.f | 6 +- fortran/examples/simple1d1.f | 10 +- fortran/examples/simple1d1.f90 | 20 +- fortran/examples/simple1d1f.f | 10 +- include/cufinufft/contrib/helper_cuda.h | 2 +- include/cufinufft/impl.h | 66 +-- include/cufinufft/spreadinterp.h | 2 +- include/cufinufft/types.h | 6 +- matlab/CMakeLists.txt | 7 +- matlab/finufft1d1.m | 1 - matlab/finufft1d2.m | 1 - matlab/finufft1d3.m | 1 - matlab/finufft2d1.m | 1 - matlab/finufft2d2.m | 1 - matlab/finufft2d3.m | 1 - matlab/finufft3d1.m | 1 - matlab/finufft3d2.m | 1 - matlab/finufft3d3.m | 2 - matlab/finufft_plan.docsrc | 3 +- matlab/finufft_plan.m | 1 - matlab/notes.docbit | 1 - matlab/test/check_modeords.m | 2 +- matlab/test/fig_accuracy.m | 1 - matlab/test/guru_setpts_issue.m | 2 +- matlab/valid_setpts.m | 4 +- perftest/CMakeLists.txt | 32 +- perftest/checkGuruTiming.sh | 4 +- perftest/compare_spreads.jl | 6 +- perftest/cuda/CMakeLists.txt | 15 +- perftest/cuda/cuperftest.cu | 4 +- perftest/mymaxthreads.sh | 2 +- perftest/mynumcores.sh | 2 +- perftest/results/gcc_vs_icc_xeon.txt | 3 +- perftest/searchForTimeMetrics.py | 6 +- perftest/spreadbenchmark.py | 22 +- perftest/spreadingSchemeStats.py | 20 +- perftest/spreadtestnd.sh | 2 +- perftest/timingBreakdowns.py | 70 ++- .../timingBreakdowns_largeProblems.out | 2 +- ...eakdowns_smallProblems_SequentialMulti.out | 3 +- ...smallProblems_SequentialMulti_noSwitch.out | 2 +- ...downs_smallProblems_SimultaneousSingle.out | 3 +- python/CMakeLists.txt | 24 +- python/finufft/test/accuracy_speed_tests.py | 2 +- src/cuda/1d/README | 12 +- src/cuda/2d/README | 12 +- src/cuda/3d/README | 14 +- src/cuda/CMakeLists.txt | 56 ++- src/cuda/README | 2 +- src/cuda/memtransfer_wrapper.cu | 7 +- test/CMakeLists.txt | 45 +- test/cuda/CMakeLists.txt | 129 ++--- test/spreadinterp1d_test.cpp | 2 +- tutorial/README | 1 - tutorial/inv1d2.m | 2 +- tutorial/serieseval1d.m | 2 +- tutorial/utils/lgwt.m | 46 +- 128 files changed, 1190 insertions(+), 1330 deletions(-) create mode 100644 .gersemirc diff --git a/.clang-format b/.clang-format index 0488353ca..e71296a29 100644 --- a/.clang-format +++ b/.clang-format @@ -30,4 +30,12 @@ SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false TabWidth: 2 UseTab: Never +AttributeMacros: ['__host__', '__device__', '__global__', '__forceinline__'] +QualifierOrder: + - static + - inline + - constexpr + - const + - type +QualifierAlignment: Custom ... diff --git a/.gersemirc b/.gersemirc new file mode 100644 index 000000000..d422a5f81 --- /dev/null +++ b/.gersemirc @@ -0,0 +1,3 @@ +definitions: + - "./cmake" +line_length: 120 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 1e469ec95..72361d1e3 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,2 +1,2 @@ # Applied clang format to the codebase -884ba427be0c60aa3399d5ea71b0e9e3a7cbf686 \ No newline at end of file +884ba427be0c60aa3399d5ea71b0e9e3a7cbf686 diff --git a/.gitattributes b/.gitattributes index dcd444fbe..d11d273f7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,2 @@ # Convert to LF line endings on checkout. -*.sh text eol=lf \ No newline at end of file +*.sh text eol=lf diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f935d780a..a2c1f8de3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,7 @@ repos: hooks: - id: clang-format types_or: [c++, c, cuda] + exclude: '(^|/)(matlab/.*)$' - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: @@ -13,6 +14,13 @@ repos: - id: check-illegal-windows-names - id: mixed-line-ending - repo: https://github.com/BlankSpruce/gersemi - rev: 0.18.2 + rev: 0.19.1 hooks: - id: gersemi + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.23 # Use the latest stable version + hooks: + - id: validate-pyproject + # Optional: Include additional validations from SchemaStore + additional_dependencies: ["validate-pyproject-schema-store[all]"] + files: ^python/(finufft|cufinufft)/pyproject\.toml$ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 0f63d4f34..5f990bbf8 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,7 +10,7 @@ build: os: ubuntu-22.04 tools: python: "3.11" - + # Build all formats formats: all diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c6284195..6a9d93d93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,12 @@ cmake_minimum_required(VERSION 3.19...3.30) -project( - FINUFFT - VERSION 2.4.0 - LANGUAGES C CXX) +project(FINUFFT VERSION 2.4.0 LANGUAGES C CXX) # windows MSVC runtime flags policy cmake_policy(SET CMP0091 NEW) include(CMakeDependentOption) -# cmake-format: off +# gersemi: off # All options go here sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_FORTRAN "Whether to build the FINUFFT Fortran examples" OFF) option(FINUFFT_BUILD_MATLAB "Whether to build the FINUFFT Matlab interface" OFF) @@ -36,11 +33,11 @@ set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying targ cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF) cmake_dependent_option(FINUFFT_STATIC_LINKING "Disable static libraries in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF) cmake_dependent_option(FINUFFT_SHARED_LINKING "Shared should be the opposite of static linking" ON "NOT FINUFFT_STATIC_LINKING" OFF) -# cmake-format: on +# gersemi: on # When building shared libraries, we need to build with -fPIC in all cases if(FINUFFT_SHARED_LINKING) - set(FINUFFT_POSITION_INDEPENDENT_CODE ON) + set(FINUFFT_POSITION_INDEPENDENT_CODE ON) endif() include(cmake/utils.cmake) @@ -67,10 +64,10 @@ set(FINUFFT_CXX_FLAGS_RELEASE /Ob /Oi /Ot - /Oy) + /Oy +) -filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_RELEASE - FINUFFT_CXX_FLAGS_RELEASE) +filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_RELEASE FINUFFT_CXX_FLAGS_RELEASE) message(STATUS "FINUFFT Release flags: ${FINUFFT_CXX_FLAGS_RELEASE}") set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE}) @@ -81,58 +78,49 @@ set(FINUFFT_CXX_FLAGS_DEBUG -ggdb3 -Wall -Wno-sign-compare - -Wno-unknown-pragmas) + -Wno-unknown-pragmas +) if(DEFINED ENV{GITHUB_ACTIONS}) - if($ENV{GITHUB_ACTIONS} STREQUAL "true") - message("CMake is being executed inside a GitHub workflow") - # if msvc use FS flag - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT CMAKE_BUILD_TYPE STREQUAL - "Release") - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT - "$<$:Embedded>") - message("CMAKE_MSVC_DEBUG_INFORMATION_FORMAT TO Embedded") + if($ENV{GITHUB_ACTIONS} STREQUAL "true") + message("CMake is being executed inside a GitHub workflow") + # if msvc use FS flag + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT CMAKE_BUILD_TYPE STREQUAL "Release") + set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:Embedded>") + message("CMAKE_MSVC_DEBUG_INFORMATION_FORMAT TO Embedded") + endif() endif() - endif() else() - message("CMake is NOT being executed inside a GitHub workflow") - # env variable is: - message(STATUS "ENV{GITHUB_ACTIONS}: $ENV{GITHUB_ACTIONS}") + message("CMake is NOT being executed inside a GitHub workflow") + # env variable is: + message(STATUS "ENV{GITHUB_ACTIONS}: $ENV{GITHUB_ACTIONS}") endif() filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_DEBUG FINUFFT_CXX_FLAGS_DEBUG) message(STATUS "FINUFFT Debug flags: ${FINUFFT_CXX_FLAGS_DEBUG}") -list(APPEND FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE} - ${FINUFFT_CXX_FLAGS_DEBUG}) -message( - STATUS "FINUFFT RelWithDebInfo flags: ${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}") +list(APPEND FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE} ${FINUFFT_CXX_FLAGS_DEBUG}) +message(STATUS "FINUFFT RelWithDebInfo flags: ${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}") if(FINUFFT_ARCH_FLAGS STREQUAL "native") - set(FINUFFT_ARCH_FLAGS - -march=native - CACHE STRING "" FORCE) - filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS) - if(NOT FINUFFT_ARCH_FLAGS) - set(FINUFFT_ARCH_FLAGS - -mtune=native - CACHE STRING "" FORCE) + set(FINUFFT_ARCH_FLAGS -march=native CACHE STRING "" FORCE) filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS) - endif() - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - # -march=native emulation for MSVC - check_arch_support() - endif() - if(NOT FINUFFT_ARCH_FLAGS) - message(WARNING "No architecture flags are supported by the compiler.") - else() - message(STATUS "FINUFFT Arch flags: ${FINUFFT_ARCH_FLAGS}") - endif() + if(NOT FINUFFT_ARCH_FLAGS) + set(FINUFFT_ARCH_FLAGS -mtune=native CACHE STRING "" FORCE) + filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS) + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # -march=native emulation for MSVC + check_arch_support() + endif() + if(NOT FINUFFT_ARCH_FLAGS) + message(WARNING "No architecture flags are supported by the compiler.") + else() + message(STATUS "FINUFFT Arch flags: ${FINUFFT_ARCH_FLAGS}") + endif() endif() # Set default build type to Release if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Set the default build type to Release" FORCE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Set the default build type to Release" FORCE) endif() # This set of sources is compiled twice, once in single precision and once in @@ -142,223 +130,206 @@ set(FINUFFT_PRECISION_DEPENDENT_SOURCES) # If we're building for Fortran, make sure we also include the translation # layer. if(FINUFFT_BUILD_FORTRAN) - list(APPEND FINUFFT_PRECISION_DEPENDENT_SOURCES fortran/finufftfort.cpp) + list(APPEND FINUFFT_PRECISION_DEPENDENT_SOURCES fortran/finufftfort.cpp) endif() # set linker flags for sanitizer set(FINUFFT_SANITIZER_FLAGS) if(FINUFFT_ENABLE_SANITIZERS) - set(FINUFFT_SANITIZER_FLAGS -fsanitize=address -fsanitize=undefined - -fsanitize=bounds-strict /fsanitize=address /RTC1) - filter_supported_compiler_flags(FINUFFT_SANITIZER_FLAGS - FINUFFT_SANITIZER_FLAGS) - set(FINUFFT_SANITIZER_FLAGS - $<$:${FINUFFT_SANITIZER_FLAGS}>) + set(FINUFFT_SANITIZER_FLAGS + -fsanitize=address + -fsanitize=undefined + -fsanitize=bounds-strict + /fsanitize=address + /RTC1 + ) + filter_supported_compiler_flags(FINUFFT_SANITIZER_FLAGS FINUFFT_SANITIZER_FLAGS) + set(FINUFFT_SANITIZER_FLAGS $<$:${FINUFFT_SANITIZER_FLAGS}>) endif() # Utility function to enable ASAN on debug builds function(enable_asan target) - target_compile_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) - if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")) - target_link_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) - endif() + target_compile_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) + if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")) + target_link_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS}) + endif() endfunction() set(CPM_DOWNLOAD_VERSION 0.40.5) include(cmake/setupCPM.cmake) if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) - include(CTest) - if(FINUFFT_BUILD_TESTS) - enable_testing() - endif() - if(FINUFFT_BUILD_DOCS) - include(cmake/setupSphinx.cmake) - endif() + include(CTest) + if(FINUFFT_BUILD_TESTS) + enable_testing() + endif() + if(FINUFFT_BUILD_DOCS) + include(cmake/setupSphinx.cmake) + endif() endif() if(FINUFFT_USE_CPU) - # make apple with gnu use old linker, new linker breaks, see issue #360 - if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) - add_link_options("-ld_classic") - endif() - set(FFTW_VERSION 3.3.10) - set(XTL_VERSION 0.7.7) - set(XSIMD_VERSION 13.1.0) - set(DUCC0_VERSION ducc0_0_36_0) - set(FINUFFT_FFTW_LIBRARIES) - include(cmake/setupXSIMD.cmake) - if(FINUFFT_USE_DUCC0) - include(cmake/setupDUCC.cmake) - else() - include(cmake/setupFFTW.cmake) - endif() - if(FINUFFT_USE_DUCC0) - set(FINUFFT_FFTLIBS ducc0) - else() - set(FINUFFT_FFTLIBS ${FINUFFT_FFTW_LIBRARIES}) - endif() - if(FINUFFT_USE_OPENMP) - find_package( - OpenMP - COMPONENTS C CXX - REQUIRED) - endif() + # make apple with gnu use old linker, new linker breaks, see issue #360 + if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) + add_link_options("-ld_classic") + endif() + set(FFTW_VERSION 3.3.10) + set(XTL_VERSION 0.7.7) + set(XSIMD_VERSION 13.1.0) + set(DUCC0_VERSION ducc0_0_36_0) + set(FINUFFT_FFTW_LIBRARIES) + include(cmake/setupXSIMD.cmake) + if(FINUFFT_USE_DUCC0) + include(cmake/setupDUCC.cmake) + else() + include(cmake/setupFFTW.cmake) + endif() + if(FINUFFT_USE_DUCC0) + set(FINUFFT_FFTLIBS ducc0) + else() + set(FINUFFT_FFTLIBS ${FINUFFT_FFTW_LIBRARIES}) + endif() + if(FINUFFT_USE_OPENMP) + find_package(OpenMP COMPONENTS C CXX REQUIRED) + endif() endif() # check if -Wno-deprecated-declarations is supported -check_cxx_compiler_flag(-Wno-deprecated-declarations - FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) +check_cxx_compiler_flag(-Wno-deprecated-declarations FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) # Utility function to link static/dynamic lib function(finufft_link_test target) - if(FINUFFT_USE_DUCC0) - target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) - endif() - target_link_libraries(${target} PRIVATE finufft xsimd ${FINUFFT_FFTLIBS}) - if(FINUFFT_USE_OPENMP) - target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) - target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() - enable_asan(${target}) - target_compile_features(${target} PRIVATE cxx_std_17) - set_target_properties( - ${target} PROPERTIES MSVC_RUNTIME_LIBRARY - "MultiThreaded$<$:Debug>") - # disable deprecated warnings for tests if supported - if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) - target_compile_options(${target} PRIVATE -Wno-deprecated-declarations) - endif() + if(FINUFFT_USE_DUCC0) + target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) + endif() + target_link_libraries(${target} PRIVATE finufft xsimd ${FINUFFT_FFTLIBS}) + if(FINUFFT_USE_OPENMP) + target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) + target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) + endif() + enable_asan(${target}) + target_compile_features(${target} PRIVATE cxx_std_17) + set_target_properties(${target} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + # disable deprecated warnings for tests if supported + if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) + target_compile_options(${target} PRIVATE -Wno-deprecated-declarations) + endif() endfunction() # Utility function to set finufft compilation options. function(set_finufft_options target) - target_compile_features(${target} PRIVATE cxx_std_17) - target_compile_options( - ${target} PRIVATE $<$:${FINUFFT_ARCH_FLAGS}>) - target_compile_options( - ${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELEASE}>) - target_compile_options( - ${target} - PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) - target_compile_options(${target} - PRIVATE $<$:${FINUFFT_CXX_FLAGS_DEBUG}>) - target_include_directories( - ${target} PUBLIC $) - target_include_directories( - ${target} SYSTEM - INTERFACE $) - set_target_properties( - ${target} - PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" - POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}) - enable_asan(${target}) - if(FINUFFT_USE_OPENMP) - target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) - target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() - if(FINUFFT_USE_DUCC0) - target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) - endif() - target_link_libraries(${target} PRIVATE xsimd) - target_link_libraries(${target} PRIVATE ${FINUFFT_FFTLIBS}) + target_compile_features(${target} PRIVATE cxx_std_17) + target_compile_options(${target} PRIVATE $<$:${FINUFFT_ARCH_FLAGS}>) + target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELEASE}>) + target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) + target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_DEBUG}>) + target_include_directories(${target} PUBLIC $) + target_include_directories(${target} SYSTEM INTERFACE $) + set_target_properties( + ${target} + PROPERTIES + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} + ) + enable_asan(${target}) + if(FINUFFT_USE_OPENMP) + target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) + target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) + endif() + if(FINUFFT_USE_DUCC0) + target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) + endif() + target_link_libraries(${target} PRIVATE xsimd) + target_link_libraries(${target} PRIVATE ${FINUFFT_FFTLIBS}) endfunction() if(FINUFFT_USE_CPU) - set(FINUFFT_SOURCES - src/spreadinterp.cpp - contrib/legendre_rule_fast.cpp - src/fft.cpp - src/finufft_core.cpp - src/c_interface.cpp - src/finufft_utils.cpp - fortran/finufftfort.cpp) - # Main finufft libraries - if(NOT FINUFFT_STATIC_LINKING) - add_library(finufft SHARED ${FINUFFT_SOURCES}) - else() - add_library(finufft STATIC ${FINUFFT_SOURCES}) - endif() - set_finufft_options(finufft) - - if(WIN32 AND FINUFFT_SHARED_LINKING) - target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL) - endif() - find_library(MATH_LIBRARY m) - if(MATH_LIBRARY) - target_link_libraries(finufft PRIVATE ${MATH_LIBRARY}) - endif() - target_include_directories( - finufft PUBLIC $) - target_include_directories( - finufft SYSTEM - INTERFACE $) - if(FINUFFT_ENABLE_INSTALL) - file(GLOB FINUFFT_PUBLIC_HEADERS - "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h") - set_target_properties(finufft PROPERTIES PUBLIC_HEADER - "${FINUFFT_PUBLIC_HEADERS}") - endif() - list(APPEND INSTALL_TARGETS finufft) + set(FINUFFT_SOURCES + src/spreadinterp.cpp + contrib/legendre_rule_fast.cpp + src/fft.cpp + src/finufft_core.cpp + src/c_interface.cpp + src/finufft_utils.cpp + fortran/finufftfort.cpp + ) + # Main finufft libraries + if(NOT FINUFFT_STATIC_LINKING) + add_library(finufft SHARED ${FINUFFT_SOURCES}) + else() + add_library(finufft STATIC ${FINUFFT_SOURCES}) + endif() + set_finufft_options(finufft) + + if(WIN32 AND FINUFFT_SHARED_LINKING) + target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL) + endif() + find_library(MATH_LIBRARY m) + if(MATH_LIBRARY) + target_link_libraries(finufft PRIVATE ${MATH_LIBRARY}) + endif() + target_include_directories(finufft PUBLIC $) + target_include_directories(finufft SYSTEM INTERFACE $) + if(FINUFFT_ENABLE_INSTALL) + file(GLOB FINUFFT_PUBLIC_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h") + set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}") + endif() + list(APPEND INSTALL_TARGETS finufft) endif() if(FINUFFT_USE_CUDA) - if(NOT DEFINED FINUFFT_CUDA_ARCHITECTURES) - if(DEFINED CMAKE_CUDA_ARCHITECTURES) - set(FINUFFT_CUDA_ARCHITECTURES "{$CMAKE_CUDA_ARCHITECTURES}") - else() - message( - "FINUFFT WARNING: No CUDA architecture supplied via '-DFINUFFT_CUDA_ARCHITECTURES=...', defaulting to 'native'" - ) - message( - "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply." - ) + if(NOT DEFINED FINUFFT_CUDA_ARCHITECTURES) + if(DEFINED CMAKE_CUDA_ARCHITECTURES) + set(FINUFFT_CUDA_ARCHITECTURES "{$CMAKE_CUDA_ARCHITECTURES}") + else() + message( + "FINUFFT WARNING: No CUDA architecture supplied via '-DFINUFFT_CUDA_ARCHITECTURES=...', defaulting to 'native'" + ) + message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") + endif() + endif() + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + add_subdirectory(src/cuda) + if(BUILD_TESTING AND FINUFFT_BUILD_TESTS) + add_subdirectory(perftest/cuda) + add_subdirectory(test/cuda) endif() - endif() - enable_language(CUDA) - find_package(CUDAToolkit REQUIRED) - add_subdirectory(src/cuda) - if(BUILD_TESTING AND FINUFFT_BUILD_TESTS) - add_subdirectory(perftest/cuda) - add_subdirectory(test/cuda) - endif() - - list(APPEND INSTALL_TARGETS cufinufft) + + list(APPEND INSTALL_TARGETS cufinufft) endif() # Add tests defined in their own directory -if(BUILD_TESTING - AND FINUFFT_USE_CPU - AND FINUFFT_BUILD_TESTS) - add_subdirectory(test) - add_subdirectory(perftest) +if(BUILD_TESTING AND FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS) + add_subdirectory(test) + add_subdirectory(perftest) endif() if(FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU) - add_subdirectory(examples) + add_subdirectory(examples) endif() if(FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CUDA) - add_subdirectory(examples/cuda) + add_subdirectory(examples/cuda) endif() if(FINUFFT_BUILD_FORTRAN) - enable_language(Fortran) - add_subdirectory(fortran) + enable_language(Fortran) + add_subdirectory(fortran) endif() if(FINUFFT_BUILD_MATLAB) - add_subdirectory(matlab) + add_subdirectory(matlab) endif() if(FINUFFT_BUILD_DEVEL) - add_subdirectory(devel) + add_subdirectory(devel) endif() if(FINUFFT_BUILD_PYTHON) - add_subdirectory(python) + add_subdirectory(python) endif() -# cmake-format: off +# gersemi: off message(STATUS "FINUFFT configuration summary:") message(STATUS " CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") message(STATUS " FINUFFT_USE_CPU: ${FINUFFT_USE_CPU}") @@ -377,31 +348,34 @@ message(STATUS " FINUFFT_FFTW_SUFFIX: ${FINUFFT_FFTW_SUFFIX}") message(STATUS " FINUFFT_FFTW_LIBRARIES: ${FINUFFT_FFTW_LIBRARIES}") message(STATUS " FINUFFT_ARCH_FLAGS: ${FINUFFT_ARCH_FLAGS}") message(STATUS " FINUFFT_USE_DUCC0: ${FINUFFT_USE_DUCC0}") -# cmake-format: on +# gersemi: on + if(FINUFFT_ENABLE_INSTALL) - include(GNUInstallDirs) - install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER) - install(FILES ${PROJECT_SOURCE_DIR}/LICENSE - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/licenses/finufft) - if(FINUFFT_USE_CPU) - install( - DIRECTORY ${PROJECT_SOURCE_DIR}/examples - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft - PATTERN "CMakeLists.txt" EXCLUDE - PATTERN "README" EXCLUDE - PATTERN "examples/cuda" EXCLUDE) - if(FINUFFT_BUILD_FORTRAN) - install(DIRECTORY ${PROJECT_SOURCE_DIR}/fortran/examples - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/fortran) - install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + include(GNUInstallDirs) + install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER) + install(FILES ${PROJECT_SOURCE_DIR}/LICENSE DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/licenses/finufft) + if(FINUFFT_USE_CPU) + install( + DIRECTORY ${PROJECT_SOURCE_DIR}/examples + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft + PATTERN "CMakeLists.txt" EXCLUDE + PATTERN "README" EXCLUDE + PATTERN "examples/cuda" EXCLUDE + ) + if(FINUFFT_BUILD_FORTRAN) + install( + DIRECTORY ${PROJECT_SOURCE_DIR}/fortran/examples + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/fortran + ) + install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + endif() + endif() + if(FINUFFT_USE_CUDA) + install( + DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/examples + PATTERN "README" EXCLUDE + PATTERN "CMakeLists.txt" EXCLUDE + ) endif() - endif() - if(FINUFFT_USE_CUDA) - install( - DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/examples - PATTERN "README" EXCLUDE - PATTERN "CMakeLists.txt" EXCLUDE) - endif() endif() diff --git a/cmake/setupCPM.cmake b/cmake/setupCPM.cmake index 610f1572b..49f696f67 100644 --- a/cmake/setupCPM.cmake +++ b/cmake/setupCPM.cmake @@ -1,21 +1,19 @@ # USING CPM TO HANDLE DEPENDENCIES if(CPM_SOURCE_CACHE) - set(CPM_DOWNLOAD_LOCATION - "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") + set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") elseif(DEFINED ENV{CPM_SOURCE_CACHE}) - set(CPM_DOWNLOAD_LOCATION - "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") + set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") else() - set(CPM_DOWNLOAD_LOCATION - "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") + set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") endif() if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) - message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") - file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake - ${CPM_DOWNLOAD_LOCATION}) + message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") + file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} + ) endif() include(${CPM_DOWNLOAD_LOCATION}) diff --git a/cmake/setupDUCC.cmake b/cmake/setupDUCC.cmake index 1d9f0f85c..89c3a093c 100644 --- a/cmake/setupDUCC.cmake +++ b/cmake/setupDUCC.cmake @@ -1,46 +1,47 @@ -cpmaddpackage( - NAME - ducc0 - GIT_REPOSITORY - https://gitlab.mpcdf.mpg.de/mtr/ducc.git - GIT_TAG - ${DUCC0_VERSION} - DOWNLOAD_ONLY - YES) +CPMAddPackage( + NAME + ducc0 + GIT_REPOSITORY + https://gitlab.mpcdf.mpg.de/mtr/ducc.git + GIT_TAG + ${DUCC0_VERSION} + DOWNLOAD_ONLY + YES +) if(ducc0_ADDED) - add_library( - ducc0 STATIC - ${ducc0_SOURCE_DIR}/src/ducc0/infra/string_utils.cc - ${ducc0_SOURCE_DIR}/src/ducc0/infra/threading.cc - ${ducc0_SOURCE_DIR}/src/ducc0/infra/mav.cc - ${ducc0_SOURCE_DIR}/src/ducc0/math/gridding_kernel.cc - ${ducc0_SOURCE_DIR}/src/ducc0/math/gl_integrator.cc) - target_include_directories(ducc0 PUBLIC ${ducc0_SOURCE_DIR}/src/) - target_compile_options( - ducc0 PRIVATE $<$:${FINUFFT_ARCH_FLAGS}>) - target_compile_options( - ducc0 PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELEASE}>) - target_compile_options( - ducc0 - PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) - target_compile_features(ducc0 PRIVATE cxx_std_17) - # private because we do not want to propagate this requirement - set_target_properties( - ducc0 - PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" - POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}) - check_cxx_compiler_flag(-ffast-math HAS_FAST_MATH) - if(HAS_FAST_MATH) - target_compile_options(ducc0 PRIVATE -ffast-math) - endif() - check_cxx_compiler_flag(/fp:fast HAS_FP_FAST) - if(HAS_FP_FAST) - target_compile_options(ducc0 PRIVATE /fp:fast) - endif() - if(NOT OpenMP_CXX_FOUND) - find_package(Threads REQUIRED) - target_link_libraries(ducc0 PRIVATE Threads::Threads) - endif() - enable_asan(ducc0) + add_library( + ducc0 + STATIC + ${ducc0_SOURCE_DIR}/src/ducc0/infra/string_utils.cc + ${ducc0_SOURCE_DIR}/src/ducc0/infra/threading.cc + ${ducc0_SOURCE_DIR}/src/ducc0/infra/mav.cc + ${ducc0_SOURCE_DIR}/src/ducc0/math/gridding_kernel.cc + ${ducc0_SOURCE_DIR}/src/ducc0/math/gl_integrator.cc + ) + target_include_directories(ducc0 PUBLIC ${ducc0_SOURCE_DIR}/src/) + target_compile_options(ducc0 PRIVATE $<$:${FINUFFT_ARCH_FLAGS}>) + target_compile_options(ducc0 PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELEASE}>) + target_compile_options(ducc0 PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) + target_compile_features(ducc0 PRIVATE cxx_std_17) + # private because we do not want to propagate this requirement + set_target_properties( + ducc0 + PROPERTIES + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} + ) + check_cxx_compiler_flag(-ffast-math HAS_FAST_MATH) + if(HAS_FAST_MATH) + target_compile_options(ducc0 PRIVATE -ffast-math) + endif() + check_cxx_compiler_flag(/fp:fast HAS_FP_FAST) + if(HAS_FP_FAST) + target_compile_options(ducc0 PRIVATE /fp:fast) + endif() + if(NOT OpenMP_CXX_FOUND) + find_package(Threads REQUIRED) + target_link_libraries(ducc0 PRIVATE Threads::Threads) + endif() + enable_asan(ducc0) endif() diff --git a/cmake/setupFFTW.cmake b/cmake/setupFFTW.cmake index e95f8885d..3e6d88063 100644 --- a/cmake/setupFFTW.cmake +++ b/cmake/setupFFTW.cmake @@ -1,87 +1,91 @@ -cpmaddpackage( - NAME - findfftw - GIT_REPOSITORY - "https://github.com/egpbos/findFFTW.git" - GIT_TAG - "master" - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES) +CPMAddPackage( + NAME + findfftw + GIT_REPOSITORY + "https://github.com/egpbos/findFFTW.git" + GIT_TAG + "master" + EXCLUDE_FROM_ALL + YES + GIT_SHALLOW + YES +) list(APPEND CMAKE_MODULE_PATH "${findfftw_SOURCE_DIR}") -if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL - DOWNLOAD) - find_package(FFTW) - if((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) - if(FINUFFT_FFTW_SUFFIX STREQUAL THREADS) - set(FINUFFT_USE_THREADS ON) - else() - set(FINUFFT_USE_THREADS OFF) - endif() - cpmaddpackage( - NAME - fftw3 - OPTIONS - "ENABLE_SSE2 ON" - "ENABLE_AVX ON" - "ENABLE_AVX2 ON" - "BUILD_TESTS OFF" - "BUILD_SHARED_LIBS OFF" - "ENABLE_THREADS ${FINUFFT_USE_THREADS}" - "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL - "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH - "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES) - - cpmaddpackage( - NAME - fftw3f - OPTIONS - "ENABLE_SSE2 ON" - "ENABLE_AVX ON" - "ENABLE_AVX2 ON" - "ENABLE_FLOAT ON" - "BUILD_TESTS OFF" - "BUILD_SHARED_LIBS OFF" - "ENABLE_THREADS ${FINUFFT_USE_THREADS}" - "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL - "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH - "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES) - set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f) - if(FINUFFT_USE_THREADS) - list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_threads fftw3f_threads) - elseif(FINUFFT_USE_OPENMP) - list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_omp fftw3f_omp) - endif() +if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD) + find_package(FFTW) + if((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) + if(FINUFFT_FFTW_SUFFIX STREQUAL THREADS) + set(FINUFFT_USE_THREADS ON) + else() + set(FINUFFT_USE_THREADS OFF) + endif() + CPMAddPackage( + NAME + fftw3 + OPTIONS + "ENABLE_SSE2 ON" + "ENABLE_AVX ON" + "ENABLE_AVX2 ON" + "BUILD_TESTS OFF" + "BUILD_SHARED_LIBS OFF" + "ENABLE_THREADS ${FINUFFT_USE_THREADS}" + "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" + URL + "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" + URL_HASH + "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" + EXCLUDE_FROM_ALL + YES + GIT_SHALLOW + YES + ) - foreach(element IN LISTS FINUFFT_FFTW_LIBRARIES) - set_target_properties( - ${element} - PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" - POSITION_INDEPENDENT_CODE - ${FINUFFT_POSITION_INDEPENDENT_CODE}) - endforeach() + CPMAddPackage( + NAME + fftw3f + OPTIONS + "ENABLE_SSE2 ON" + "ENABLE_AVX ON" + "ENABLE_AVX2 ON" + "ENABLE_FLOAT ON" + "BUILD_TESTS OFF" + "BUILD_SHARED_LIBS OFF" + "ENABLE_THREADS ${FINUFFT_USE_THREADS}" + "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" + URL + "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" + URL_HASH + "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" + EXCLUDE_FROM_ALL + YES + GIT_SHALLOW + YES + ) + set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f) + if(FINUFFT_USE_THREADS) + list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_threads fftw3f_threads) + elseif(FINUFFT_USE_OPENMP) + list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_omp fftw3f_omp) + endif() - target_include_directories( - fftw3 PUBLIC $) + foreach(element IN LISTS FINUFFT_FFTW_LIBRARIES) + set_target_properties( + ${element} + PROPERTIES + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} + ) + endforeach() - else() - set(FINUFFT_FFTW_LIBRARIES - "FFTW::Float" "FFTW::Double" "FFTW::Float${FINUFFT_FFTW_SUFFIX}" - "FFTW::Double${FINUFFT_FFTW_SUFFIX}") - endif() + target_include_directories(fftw3 PUBLIC $) + else() + set(FINUFFT_FFTW_LIBRARIES + "FFTW::Float" + "FFTW::Double" + "FFTW::Float${FINUFFT_FFTW_SUFFIX}" + "FFTW::Double${FINUFFT_FFTW_SUFFIX}" + ) + endif() endif() diff --git a/cmake/setupSphinx.cmake b/cmake/setupSphinx.cmake index 996419b5c..30270da9e 100644 --- a/cmake/setupSphinx.cmake +++ b/cmake/setupSphinx.cmake @@ -1,21 +1,23 @@ -cpmaddpackage( - NAME - sphinx_cmake - GIT_REPOSITORY - https://github.com/k0ekk0ek/cmake-sphinx.git - GIT_TAG - e13c40a - DOWNLOAD_ONLY - YES) +CPMAddPackage( + NAME + sphinx_cmake + GIT_REPOSITORY + https://github.com/k0ekk0ek/cmake-sphinx.git + GIT_TAG + e13c40a + DOWNLOAD_ONLY + YES +) list(APPEND CMAKE_MODULE_PATH ${sphinx_cmake_SOURCE_DIR}/cmake/Modules) # requires pip install sphinx texext find_package(Sphinx) if(SPHINX_FOUND) - message(STATUS "Sphinx found") - sphinx_add_docs(finufft_sphinx BUILDER html SOURCE_DIRECTORY - ${FINUFFT_SOURCE_DIR}/docs) + message(STATUS "Sphinx found") + sphinx_add_docs(finufft_sphinx BUILDER html SOURCE_DIRECTORY + ${FINUFFT_SOURCE_DIR}/docs + ) else() - message(STATUS "Sphinx not found docs will not be generated") + message(STATUS "Sphinx not found docs will not be generated") endif() diff --git a/cmake/setupXSIMD.cmake b/cmake/setupXSIMD.cmake index 5f31ae61e..475412514 100644 --- a/cmake/setupXSIMD.cmake +++ b/cmake/setupXSIMD.cmake @@ -1,28 +1,30 @@ -cpmaddpackage( - NAME - xtl - GIT_REPOSITORY - "https://github.com/xtensor-stack/xtl.git" - GIT_TAG - ${XTL_VERSION} - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES - OPTIONS - "XTL_DISABLE_EXCEPTIONS YES") +CPMAddPackage( + NAME + xtl + GIT_REPOSITORY + "https://github.com/xtensor-stack/xtl.git" + GIT_TAG + ${XTL_VERSION} + EXCLUDE_FROM_ALL + YES + GIT_SHALLOW + YES + OPTIONS + "XTL_DISABLE_EXCEPTIONS YES" +) -cpmaddpackage( - NAME - xsimd - GIT_REPOSITORY - "https://github.com/xtensor-stack/xsimd.git" - GIT_TAG - ${XSIMD_VERSION} - EXCLUDE_FROM_ALL - YES - GIT_SHALLOW - YES - OPTIONS - "XSIMD_SKIP_INSTALL YES" - "XSIMD_ENABLE_XTL_COMPLEX YES") +CPMAddPackage( + NAME + xsimd + GIT_REPOSITORY + "https://github.com/xtensor-stack/xsimd.git" + GIT_TAG + ${XSIMD_VERSION} + EXCLUDE_FROM_ALL + YES + GIT_SHALLOW + YES + OPTIONS + "XSIMD_SKIP_INSTALL YES" + "XSIMD_ENABLE_XTL_COMPLEX YES" +) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 5aab67dc7..bcbbb5169 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -2,88 +2,76 @@ include(CheckCXXCompilerFlag) # Define the function function(filter_supported_compiler_flags input_flags_var output_flags_var) - # Create an empty list to store supported flags - set(supported_flags) - # Iterate over each flag in the input list - set(ORIGINAL_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}) - foreach(flag ${${input_flags_var}}) - string(REPLACE "=" "_" flag_var ${flag}) # Convert flag to a valid variable - # name - string(REPLACE "-" "" flag_var ${flag_var}) # Remove '-' for the variable - # name Append the test linker flag to the existing flags - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}") - check_cxx_compiler_flag(${flag} ${flag_var}) - if(${flag_var}) - # If supported, append the flag to the list of supported flags - list(APPEND supported_flags ${flag}) - else() - message(STATUS "Flag ${flag} is not supported") - endif() - unset(${flag_var} CACHE) - # remove last flag from CMAKE_EXE_LINKER_FLAGS using substring - set(CMAKE_EXE_LINKER_FLAGS ${ORIGINAL_LINKER_FLAGS}) - endforeach() - # Set the output variable to the list of supported flags - set(${output_flags_var} - ${supported_flags} - PARENT_SCOPE) + # Create an empty list to store supported flags + set(supported_flags) + # Iterate over each flag in the input list + set(ORIGINAL_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}) + foreach(flag ${${input_flags_var}}) + string(REPLACE "=" "_" flag_var ${flag}) # Convert flag to a valid variable + # name + string(REPLACE "-" "" flag_var ${flag_var}) # Remove '-' for the variable + # name Append the test linker flag to the existing flags + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}") + check_cxx_compiler_flag(${flag} ${flag_var}) + if(${flag_var}) + # If supported, append the flag to the list of supported flags + list(APPEND supported_flags ${flag}) + else() + message(STATUS "Flag ${flag} is not supported") + endif() + unset(${flag_var} CACHE) + # remove last flag from CMAKE_EXE_LINKER_FLAGS using substring + set(CMAKE_EXE_LINKER_FLAGS ${ORIGINAL_LINKER_FLAGS}) + endforeach() + # Set the output variable to the list of supported flags + set(${output_flags_var} ${supported_flags} PARENT_SCOPE) endfunction() function(check_arch_support) - message(STATUS "Checking for AVX, AVX512 and SSE support") - try_run( - RUN_RESULT_VAR COMPILE_RESULT_VAR ${CMAKE_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/CheckAVX.cpp - COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT - RUN_OUTPUT_VARIABLE RUN_OUTPUT) - if(RUN_OUTPUT MATCHES "AVX512") - set(FINUFFT_ARCH_FLAGS - "/arch:AVX512" - CACHE STRING "" FORCE) - elseif(RUN_OUTPUT MATCHES "AVX2") - set(FINUFFT_ARCH_FLAGS - "/arch:AVX2" - CACHE STRING "" FORCE) - elseif(RUN_OUTPUT MATCHES "AVX") - set(FINUFFT_ARCH_FLAGS - "/arch:AVX" - CACHE STRING "" FORCE) - elseif(RUN_OUTPUT MATCHES "SSE") - set(FINUFFT_ARCH_FLAGS - "/arch:SSE" - CACHE STRING "" FORCE) - else() - set(FINUFFT_ARCH_FLAGS - "" - CACHE STRING "" FORCE) - endif() - message(STATUS "CPU supports: ${RUN_OUTPUT}") - message(STATUS "Using MSVC flags: ${FINUFFT_ARCH_FLAGS}") + message(STATUS "Checking for AVX, AVX512 and SSE support") + try_run( + RUN_RESULT_VAR + COMPILE_RESULT_VAR + ${CMAKE_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/CheckAVX.cpp + COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT + RUN_OUTPUT_VARIABLE RUN_OUTPUT + ) + if(RUN_OUTPUT MATCHES "AVX512") + set(FINUFFT_ARCH_FLAGS "/arch:AVX512" CACHE STRING "" FORCE) + elseif(RUN_OUTPUT MATCHES "AVX2") + set(FINUFFT_ARCH_FLAGS "/arch:AVX2" CACHE STRING "" FORCE) + elseif(RUN_OUTPUT MATCHES "AVX") + set(FINUFFT_ARCH_FLAGS "/arch:AVX" CACHE STRING "" FORCE) + elseif(RUN_OUTPUT MATCHES "SSE") + set(FINUFFT_ARCH_FLAGS "/arch:SSE" CACHE STRING "" FORCE) + else() + set(FINUFFT_ARCH_FLAGS "" CACHE STRING "" FORCE) + endif() + message(STATUS "CPU supports: ${RUN_OUTPUT}") + message(STATUS "Using MSVC flags: ${FINUFFT_ARCH_FLAGS}") endfunction() function(copy_dll source_target destination_target) - if(NOT WIN32) - return() - endif() - # Get the binary directory of the destination target - get_target_property(DESTINATION_DIR ${destination_target} BINARY_DIR) - set(DESTINATION_FILE ${DESTINATION_DIR}/$) - if(NOT EXISTS ${DESTINATION_FILE}) - message( - STATUS - "Copying ${source_target} to ${DESTINATION_DIR} directory for ${destination_target}" - ) - # Define the custom command to copy the source target to the destination - # directory - add_custom_command( - TARGET ${destination_target} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy $ - ${DESTINATION_FILE} - COMMENT "Copying ${source_target} to ${destination_target} directory") - endif() - # Unset the variables to leave a clean state - unset(DESTINATION_DIR) - unset(SOURCE_FILE) - unset(DESTINATION_FILE) + if(NOT WIN32) + return() + endif() + # Get the binary directory of the destination target + get_target_property(DESTINATION_DIR ${destination_target} BINARY_DIR) + set(DESTINATION_FILE ${DESTINATION_DIR}/$) + if(NOT EXISTS ${DESTINATION_FILE}) + message(STATUS "Copying ${source_target} to ${DESTINATION_DIR} directory for ${destination_target}") + # Define the custom command to copy the source target to the destination + # directory + add_custom_command( + TARGET ${destination_target} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy $ ${DESTINATION_FILE} + COMMENT "Copying ${source_target} to ${destination_target} directory" + ) + endif() + # Unset the variables to leave a clean state + unset(DESTINATION_DIR) + unset(SOURCE_FILE) + unset(DESTINATION_FILE) endfunction() diff --git a/contributing.md b/contributing.md index ad79e9abf..14f2abe53 100644 --- a/contributing.md +++ b/contributing.md @@ -1,4 +1,4 @@ -This repository is formatted according to the .clang-format in the root directory. +This repository is formatted according to the .clang-format in the root directory. Please enable the reformatting hook before committing your changes. See [pre-commit](https://pre-commit.com/) for more information. A quick summary: diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt index 45b9a5989..d9d2251bd 100644 --- a/devel/CMakeLists.txt +++ b/devel/CMakeLists.txt @@ -3,19 +3,20 @@ project(finufft_devel) cmake_minimum_required(VERSION 3.5) # include cpm cmake, downloading it -cpmaddpackage( - NAME - benchmark - GITHUB_REPOSITORY - google/benchmark - VERSION - 1.8.3 - OPTIONS - "BENCHMARK_ENABLE_TESTING OFF") +CPMAddPackage( + NAME + benchmark + GITHUB_REPOSITORY + google/benchmark + VERSION + 1.8.3 + OPTIONS + "BENCHMARK_ENABLE_TESTING OFF" +) if(benchmark_ADDED) - # patch benchmark target - set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) + # patch benchmark target + set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) endif() add_executable(foldrescale foldrescale.cpp) diff --git a/devel/GuruInterfaceBranch_BrainDump b/devel/GuruInterfaceBranch_BrainDump index f8ef8fd49..43a2fba8e 100644 --- a/devel/GuruInterfaceBranch_BrainDump +++ b/devel/GuruInterfaceBranch_BrainDump @@ -19,7 +19,7 @@ the makefile line "CFLAGS += -I include" ensures all headers inside of this "inc available to a file if they are referred to with the former syntax. |nufft_opts.h| : Additional parameter spread_scheme {0:Sequential Multithreaded, 1: Nested Multithreaded}. - The default spread_scheme is set to 0. + The default spread_scheme is set to 0. **Src Directory** @@ -45,14 +45,14 @@ available to a file if they are referred to with the former syntax. 1) finufft_default_opts(): identical to original version + extra spreading_scheme parameter set default to 0 : sequential multithreaded scheme. Set to 1 to switch to simultaneous single threaded with nested at the end. Description of schemes below. - + 2) finufft_makeplan(): sets the fields of the plan struct. ==================================Types 1 + 2============================================= Allocates the internal fw array that is the workspace for fftw_execute(). For cases where n_trans > 1, user supplied c and F arrays are stacked arrays of size M*n_transf and N*n_transf respectively. Because finufft_exec() performs its steps in batches of size threadBlkSize - (a parameter for this function) Internal arrays like fw (and cpj, see finufft_exec) are still + (a parameter for this function) Internal arrays like fw (and cpj, see finufft_exec) are still stacked, but only threadBlkSize times. I.e. in 1d fw is of size nf1*threadBlkSize. Sets of weights are spread to the uniform grid threadBlkSize at a time. This holds true for all n_transf >= threadBlkSize. If n_transf < threadBlkSize, then internal arrays are only stacked @@ -68,9 +68,9 @@ available to a file if they are referred to with the former syntax. to not run into trouble, however the contents of the array at the end are garbage- never filled in with sets of spread/interpolated weights. They are not read out either, so there is no effect. This course was chosen instead of creating a new fftw_plan on the last round with a different - "how_many" and calling fftw_exec on the new plan. - - 3) finufft_setpts(): sets the coordinate pointers with the provided pts. + "how_many" and calling fftw_exec on the new plan. + + 3) finufft_setpts(): sets the coordinate pointers with the provided pts. ==================================Type 1 + 2================================================ @@ -89,15 +89,15 @@ available to a file if they are referred to with the former syntax. The target frequencies are scaled, and the spreading kernel fourier weights are computed. PhiHat is the name of the array in the finufft_plan struct referred to as fwker for type 1+2 and fkker - in type 3 in older code. - - + in type 3 in older code. + + 4) finufft_exec(): Main body for both types is contained in a loop over the number of batches. Inside are calls to spread, interp, deconvolve, prephase as appropriate for type, and fftw_exec. nBatches = ceil(n_transf/threadBlkSize). The number of sets in a batch is equal to min(threadBlkSize, n_transf). Where n_transf > threadBlkSize, i.e. nBatches > 1, for the last batch, the number of sets in the batch is modulus n_transf/threadBlkSize. - The number of sets in a batch is the number times that the spread, interpolated, deconvolve, + The number of sets in a batch is the number times that the spread, interpolated, deconvolve, and prephase routines execute. Thus there are no extra calls to any of these functions on the last loop of execution. Recall that this is not the case for fftw_exec - internally it will do more work than necessary on the last round if nBatches > 1. @@ -134,10 +134,10 @@ available to a file if they are referred to with the former syntax. because the type 2 n_transf is always less than or equal to the threadBlkSize. On the last type 3 execute batch, the interior type 2 n_transf field must be set to the modulus of n_transf/threadBlkSize, because the cj and fk arrays provided by the user are only the - stacked to {M,N}*n_transf, so extra work at the end would try to access outside of allocated memory. + stacked to {M,N}*n_transf, so extra work at the end would try to access outside of allocated memory. - 5) finnuft_destroy(): releases all memory allocated internally by finufft and calls fftw_destroy on the plan. + 5) finnuft_destroy(): releases all memory allocated internally by finufft and calls fftw_destroy on the plan. **Test Directory** @@ -149,7 +149,7 @@ available to a file if they are referred to with the former syntax. |dumbinputs| : has been expanded to check bad inputs across the extended legacy interface -|finufftGuru_test| : Main demo code for the guru interface and test program executable. +|finufftGuru_test| : Main demo code for the guru interface and test program executable. Note on Usage: spread_scheme parameter after the debug that allows user to choose sequential multithreaded spread vs simultaneous single threaded+nested last. Default is sequential multithreaded spread/interp. @@ -173,7 +173,7 @@ available to a file if they are referred to with the former syntax. if n_trials = [1,10,100] at the top of the script pattern of any given printed array: [dim1_1trial, dim1_10trials, dim1_100trials, dim2_1trial, etc ] - + Run this on a rusty node and you should find total time ratios ~>= 1 for *almost* all of the big problems defaulted in this script, indicating that the old implementation took longer than the new. @@ -185,7 +185,7 @@ available to a file if they are referred to with the former syntax. n_trials grows, evidence for completion of a primary goal: reduce finufft_plan retrieval overhead time in repeated executions. *** - fftw_exec times are printed last. Same sanity check as above applies. + fftw_exec times are printed last. Same sanity check as above applies. |spreadingSchemeStats.py| : this script runs a single small problem and a single large problem for the two spreading schemes. See #4 of Timing Questions and Conclusions for further information on the schemes. The output of @@ -204,11 +204,11 @@ available to a file if they are referred to with the former syntax. The file timingResults/timingBreakdowns_largeProblems.out contain stats reported from timingBreakdowns.py for a problem size 1e7. The files timingResults/timingBreakdowns_smallProblems_* contain stats reported for timingBreaddowns.py for a problem of size 1e4. All of these show a decrease in time spent in fftw_plan linearly proportional to the number of transforms computed. - + 2) Speedup due to non-repeated sorting? The Spreading results section in timingResults/timingBreakdowns_sortingDominates.out, run with M = 1e7 and N = 1e4, - provides good evidence for the guru speedup due to time advantage of single sort over repeated sort. - + provides good evidence for the guru speedup due to time advantage of single sort over repeated sort. + 3) FFTW and single vs many plan and execution? Does FFTW do better performing one fftw_execute many times in a loop or a single time over many sets? For large problems, many execution looks better than repeated single, however trials @@ -252,7 +252,7 @@ available to a file if they are referred to with the former syntax. investigation. The code being referred to is identical in spreadAllSetsInBatch and interpAllSetsInBatch in finufft.cpp. The lines beginning at the declaration of the n_outerThreads variable down to #pragma omp parallel before the for-loop, and then the three lines at the bottom that reset nested to 0. - + **Testing Gotchas...BEWARE** 1) |FFTW library clean slate|: Even with fftw_destoy_plan(&plan) [inside of finufft_destroy] and fftw_forget_wisdom() diff --git a/devel/agenda_11-27-23.txt b/devel/agenda_11-27-23.txt index 6db640361..f2cb6a6c7 100644 --- a/devel/agenda_11-27-23.txt +++ b/devel/agenda_11-27-23.txt @@ -45,7 +45,7 @@ library..... - jax waiting on this - ask Robert: unclear why some fftw_free or alloc are mutexed and not others? - ahb tested w/ all make tasks.] - + GPU: PR 330: deal w/ cuda streams - difficult for ahb to test - jax waiting on this - multicoil MRI users have opinions @@ -55,7 +55,7 @@ remove obsolete bkw-compat (nufft_opts) ? wrappers.... -Libin to fix (1,M) vs (M,) Py issue #367 +Libin to fix (1,M) vs (M,) Py issue #367 [Libin did PR for #359, but not closed? I tested py and closed] @@ -141,4 +141,3 @@ CI is not mentioned. #569 ? onedim_fseries_kernel ... was not brought in Can I kill .travis.yml ? (3 yrs old - what's it for?) - diff --git a/devel/check_dumbinputs.cmake b/devel/check_dumbinputs.cmake index 4a722fa8b..8ef110189 100644 --- a/devel/check_dumbinputs.cmake +++ b/devel/check_dumbinputs.cmake @@ -2,18 +2,18 @@ # used by test/CMakeLists.txt # pick place for stdout to be saved -set(tempout - "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${CMAKE_ARGV3}.out" - ) +set(tempout "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${CMAKE_ARGV3}.out") # pipe dumbinputs there. Is it platform-indep? # https://stackoverflow.com/questions/36304289/how-to-use-redirection-in-cmake-add-test execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/dumbinputs OUTPUT_FILE ${tempout}) # diff the output against reference -execute_process(COMMAND ${CMAKE_COMMAND} -E compare_files --ignore-eol ${tempout} ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${CMAKE_ARGV3}.refout) - - +execute_process( + COMMAND + ${CMAKE_COMMAND} -E compare_files --ignore-eol ${tempout} + ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${CMAKE_ARGV3}.refout +) # ================== the above was an example extra .cmake script # in an attempt at platform-indep. @@ -21,35 +21,35 @@ execute_process(COMMAND ${CMAKE_COMMAND} -E compare_files --ignore-eol ${tempout # here are other ideas from the CMakeLists.txt by Alex, and Libin: if(0) -# Here's where to save output of dumbinputs for later diff -set(tempout - "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX}.out" - ) -# follow https://stackoverflow.com/questions/36304289/how-to-use-redirection-in-cmake-add-test -# https://stackoverflow.com/questions/39960173/run-custom-shell-script-with-cmake -# Is pipe here platform-indep? very hard to find out. sh or bash are not cross-platform :( -add_custom_target( - run_dumbinputs_${PREC} ALL - COMMENT "Custom target run of dumbinputs${SUFFIX}..." - # this still spews stuff to stderr which looks broken to the user; it is not... - COMMAND OMP_NUM_THREADS=1 ${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX} > ${tempout} - # I don't know if correctly using dependency: it needs the current executable... - DEPENDS dumbinputs${SUFFIX} - ) -# I considered add_test() as another way (fails since can't handle the pipe >) -# and add_custom_command(), which fails to run unless add_custom_target used anyway. -# annoyingly, add_test COMMAND is quite lame: does not allow piping, as above, -# or multiple cmds -# See: https://stackoverflow.com/questions/3065220/ctest-with-multiple-commands?rq=4 -add_test( - NAME check_dumbinputs_${PREC} - COMMAND ${CMAKE_COMMAND} -E compare_files ${tempout} ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${SUFFIX}.refout - ) + # Here's where to save output of dumbinputs for later diff + set(tempout "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX}.out") + # follow https://stackoverflow.com/questions/36304289/how-to-use-redirection-in-cmake-add-test + # https://stackoverflow.com/questions/39960173/run-custom-shell-script-with-cmake + # Is pipe here platform-indep? very hard to find out. sh or bash are not cross-platform :( + add_custom_target( + run_dumbinputs_${PREC} + ALL + COMMENT "Custom target run of dumbinputs${SUFFIX}..." + # this still spews stuff to stderr which looks broken to the user; it is not... + COMMAND OMP_NUM_THREADS=1 ${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX} > ${tempout} + # I don't know if correctly using dependency: it needs the current executable... + DEPENDS dumbinputs${SUFFIX} + ) + # I considered add_test() as another way (fails since can't handle the pipe >) + # and add_custom_command(), which fails to run unless add_custom_target used anyway. + # annoyingly, add_test COMMAND is quite lame: does not allow piping, as above, + # or multiple cmds + # See: https://stackoverflow.com/questions/3065220/ctest-with-multiple-commands?rq=4 + add_test( + NAME check_dumbinputs_${PREC} + COMMAND + ${CMAKE_COMMAND} -E compare_files ${tempout} ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${SUFFIX}.refout + ) endif() # another way of using add_test, remove add_custom_target by calling the devel/check_dumbinputs.cmake # if move check_dumbinputs.cmake to test directory, ${CMAKE_SOURCE_DIR} should be changed to ${CMAKE_CURRENT_SOURCE_DIR} add_test( - NAME check_dumbinputs_${PREC} - COMMAND ${CMAKE_COMMAND} -P ${CMAKE_SOURCE_DIR}/devel/check_dumbinputs.cmake ${SUFFIX} - ) + NAME check_dumbinputs_${PREC} + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_SOURCE_DIR}/devel/check_dumbinputs.cmake ${SUFFIX} +) diff --git a/devel/compare_foldrescale_PR440_laptop5700U.txt b/devel/compare_foldrescale_PR440_laptop5700U.txt index 0335cdec2..8a1ef5276 100644 --- a/devel/compare_foldrescale_PR440_laptop5700U.txt +++ b/devel/compare_foldrescale_PR440_laptop5700U.txt @@ -111,7 +111,7 @@ spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2 1D Concl: single-thread 7% speedup interp (dir=2) - none to do with sorting 5% speedup spread dir=1. - multi-thread no significant change (~1% level). + multi-thread no significant change (~1% level). Also noted: PR #440 compile time for spreadinterp.o is 10x longer than before (~5 sec) @@ -144,7 +144,7 @@ spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2 t2 spreading loop: 0.752 s 1e+07 NU pts in 0.892 s 1.12e+07 pts/s 8.97e+07 spread pts/s max rel err in values at NU pts: 0.315 - + (base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1 setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4 sorted (1 threads): 1.7e-05 s @@ -195,7 +195,7 @@ spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2 t2 spreading loop: 0.687 s 1e+07 NU pts in 0.829 s 1.21e+07 pts/s 9.65e+07 spread pts/s max rel err in values at NU pts: 0.315 - + (base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1 setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4 sorted (1 threads): 1.8e-05 s @@ -226,4 +226,3 @@ concl: single-thread: spread no change; interp is 9% faster Overall: only affects single-core perf, and by 9% or less. (Of course, advantage of no 3pi-restriction is good too) - diff --git a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt index c80a53f08..803418950 100644 --- a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt +++ b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt @@ -114,8 +114,3 @@ Low priority, unless it helps rewrite gpu code. MKL is faster a bit than FFTW, not worth including. Pocketfft is not as fast (ducc.FFT is better). M Reinecke. kissFFT terrible & inaccurate. - - - - - diff --git a/devel/eval_ker_expts.cpp b/devel/eval_ker_expts.cpp index 8da4a1699..36f22aaca 100644 --- a/devel/eval_ker_expts.cpp +++ b/devel/eval_ker_expts.cpp @@ -30,7 +30,7 @@ static inline void evaluate_kernel_vector(FLT *__restrict__ ker, #pragma omp simd for (int i = 0; i < N; i++) // Loop 1: Compute exponential arguments ker[i] = beta * sqrt(1.0 - c * args[i] * args[i]); - // ker[i] = beta * (1.0 - c*args[i]*args[i]); // no-sqrt version + // ker[i] = beta * (1.0 - c*args[i]*args[i]); // no-sqrt version #pragma omp simd for (int i = 0; i < N; i++) // Loop 2: Compute exponentials diff --git a/devel/finufft_meeting-7-5-23.txt b/devel/finufft_meeting-7-5-23.txt index 3e843d84c..691a54422 100644 --- a/devel/finufft_meeting-7-5-23.txt +++ b/devel/finufft_meeting-7-5-23.txt @@ -70,4 +70,3 @@ Reinecke perf improvements CPU: binsorting; interp in 2d/3d. 1.3 legacy cuf repo. docs - diff --git a/devel/foldrescale_spreadtest1d_slower.txt b/devel/foldrescale_spreadtest1d_slower.txt index 8b38e8463..2927b97cf 100644 --- a/devel/foldrescale_spreadtest1d_slower.txt +++ b/devel/foldrescale_spreadtest1d_slower.txt @@ -68,4 +68,3 @@ spreadinterp 1D, 8e+06 U pts, dir=2, tol=1e-06: nspread=7 t2 spreading loop: 0.197 s 8e+06 NU pts in 0.205 s 3.9e+07 pts/s 2.73e+08 spread pts/s max rel err in values at NU pts: 1.13e-06 - diff --git a/devel/gen_all_horner_cpp_header.m b/devel/gen_all_horner_cpp_header.m index d6d3a7e52..8d784971a 100644 --- a/devel/gen_all_horner_cpp_header.m +++ b/devel/gen_all_horner_cpp_header.m @@ -12,7 +12,7 @@ for upsampfac = [2.0, 1.25] % sigma: either 2 (default) or low (eg 5/4) fprintf('upsampfac = %g...\n',upsampfac) - + ws = 2:16; % list of widths (the driver, rather than tols) % (must match MIN and MAX NSPREAD in source headers) if upsampfac==2 @@ -24,7 +24,7 @@ fwrite(fid,sprintf('// fine-grid interval. Generated by gen_all_horner_cpp_header.m in devel/\n')); fwrite(fid,sprintf('// Authors: Alex Barnett, Ludvig af Klinteberg, Marco Barbone & Libin Lu.\n// (C) 2018--2024 The Simons Foundation, Inc.\n')); fwrite(fid,sprintf('#include \n\n')); - + usf_tag = sprintf('%d',100*upsampfac); % string, follow Barbone convention: 200 or 125 fwrite(fid,sprintf('template\nconstexpr auto get_horner_coeffs_%s() noexcept {\n',usf_tag)); @@ -33,7 +33,7 @@ [d,beta] = get_degree_and_beta(w,upsampfac); fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta); str = gen_ker_horner_loop_cpp_code(w,d,beta); % code strings for this w (nc hardcoded) - + if j==1 % write switch statement fwrite(fid,sprintf(' if constexpr (w == %d) {\n',w)); else @@ -48,5 +48,5 @@ fwrite(fid,sprintf(' static_assert(w <= %d, "w must be <= %d");\n',ws(end),ws(end))); fwrite(fid,sprintf(' };\n')); % close all brackets fclose(fid); - + end diff --git a/devel/ker_ppval_coeff_mat.m b/devel/ker_ppval_coeff_mat.m index 218d42393..34721eb89 100644 --- a/devel/ker_ppval_coeff_mat.m +++ b/devel/ker_ppval_coeff_mat.m @@ -1,5 +1,5 @@ function C = ker_ppval_coeff_mat(w,d,be,o) -% KER_PPVAL_COEFF_MAT matrix of piecewise poly ES kernel coeffs +% KER_PPVAL_COEFF_MAT matrix of piecewise poly ES kernel coeffs % % C = ker_ppval_coeff_mat(w,nterms,beta,opts) % @@ -77,4 +77,3 @@ w,d,tol,maxerr,maxerr/tol) end end - diff --git a/devel/padding.cpp b/devel/padding.cpp index 95ace9e12..727023ba7 100644 --- a/devel/padding.cpp +++ b/devel/padding.cpp @@ -275,4 +275,4 @@ int main(int argc, char *argv[]) { << uint64_t(get_padding()) << std::endl; return 0; -} \ No newline at end of file +} diff --git a/devel/plans_fall23.txt b/devel/plans_fall23.txt index 359d8cc8b..0cda26530 100644 --- a/devel/plans_fall23.txt +++ b/devel/plans_fall23.txt @@ -24,4 +24,3 @@ Robert + Joakim Go through Issues & prioritize. PRs. - diff --git a/devel/reverse_engineer_tol.m b/devel/reverse_engineer_tol.m index a57f013ab..cae196735 100644 --- a/devel/reverse_engineer_tol.m +++ b/devel/reverse_engineer_tol.m @@ -5,9 +5,9 @@ % % For fixed upsampfac (aka sigma), must be the inverse function for % how w is chosen from tol in spreadinterp.cpp:setup_spreader() - + % Barnett 7/22/24 - + if upsampfac==2.0 tol = 10^(1-w); else diff --git a/devel/test_ker_ppval.cpp b/devel/test_ker_ppval.cpp index 35d29d645..c044ac91c 100644 --- a/devel/test_ker_ppval.cpp +++ b/devel/test_ker_ppval.cpp @@ -61,9 +61,9 @@ static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT b { #pragma omp simd for (int i = 0; i < w; i++) ker[i] = exp(beta * sqrt((FLT)1.0 - c * args[i] * args[i])); - // gcc 5.4 can't simd the combined loop, hence we split the - // out-of-support test to subsequent loop... - // This check loop prevents getting 0.2s (600 Meval/s): + // gcc 5.4 can't simd the combined loop, hence we split the + // out-of-support test to subsequent loop... + // This check loop prevents getting 0.2s (600 Meval/s): #pragma omp simd for (int i = 0; i < w; i++) if (fabs(args[i]) >= (FLT)w / 2) // note fabs not abs! diff --git a/devel/v2spec.md b/devel/v2spec.md index 0f1d01146..a15cacde3 100644 --- a/devel/v2spec.md +++ b/devel/v2spec.md @@ -47,7 +47,7 @@ int finufft1d1(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms, return ier; } -int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, +int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT* xj,FLT *yj, FLT *zj, CPX* cj,int iflag, FLT eps, BIGINT *n_modes, BIGINT nk, FLT *s, FLT *t, FLT *u, CPX* fk, nufft_opts *opts_ptr) { finufft_plan plan; @@ -158,7 +158,7 @@ finufft.destroy(plan) N=2 # now N is interpreted as the dim ? or have separate dim argument?? plan = finufft.plan(3, N, ntransf, isign, tol) # (set x,y, sx, sy) -finufft.set_nu_pts(plan, x,y,None, sx,sy,None) # here sx,sy,sz are output NU +finufft.set_nu_pts(plan, x,y,None, sx,sy,None) # here sx,sy,sz are output NU finufft.execute(plan, c, f) # reads c and writes to f. # note c, f size must match ntransf finufft.destroy(plan) diff --git a/docs/Makefile b/docs/Makefile index 4824c849a..bfded925e 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. -SPHINXOPTS = +SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build diff --git a/docs/c.rst b/docs/c.rst index a8d7ab494..c0e773c91 100644 --- a/docs/c.rst +++ b/docs/c.rst @@ -38,7 +38,7 @@ then set default values by passing a pointer (here ``opts``) to the following:: void finufft_default_opts(finufft_opts* opts) void finufftf_default_opts(finufft_opts* opts) - + Set values in a NUFFT options struct to their default values. Be sure to use the first version for double-precision and the second for single-precision. You may then change options with, for example, ``opts->debug=1;`` @@ -73,9 +73,9 @@ with the word "many" in the function name) perform ``ntr`` transforms with the s .. _guru: - + Guru plan interface -------------------- +------------------- This provides more flexibility than the simple or vectorized interfaces. Any transform requires (at least) @@ -90,4 +90,3 @@ Keep in mind that ``setpts`` retains *pointers* to the user's list of nonuniform The ``plan`` object (type ``finufft{f}_plan``) is an opaque pointer; the public interface specifies no more details that that. Under the hood in our library the plan happens to point to a C++ object of type ``finufft{f}_plan_s``, whose internal details the library user should not attempt to access, nor to rely on. .. include:: cguru.doc - diff --git a/docs/c1d.doc b/docs/c1d.doc index c16bc6241..a12401cae 100644 --- a/docs/c1d.doc +++ b/docs/c1d.doc @@ -1,23 +1,23 @@ :: - - int finufft1d1(int64_t M, double* x, complex* c, int iflag, double eps, int64_t + + int finufft1d1(int64_t M, double* x, complex* c, int iflag, double eps, int64_t N1, complex* f, finufft_opts* opts) - int finufftf1d1(int64_t M, float* x, complex* c, int iflag, float eps, int64_t N1, + int finufftf1d1(int64_t M, float* x, complex* c, int iflag, float eps, int64_t N1, complex* f, finufftf_opts* opts) - - int finufft1d1many(int ntr, int64_t M, double* x, complex* c, int iflag, double + + int finufft1d1many(int ntr, int64_t M, double* x, complex* c, int iflag, double eps, int64_t N1, complex* f, finufft_opts* opts) - int finufftf1d1many(int ntr, int64_t M, float* x, complex* c, int iflag, float + int finufftf1d1many(int ntr, int64_t M, float* x, complex* c, int iflag, float eps, int64_t N1, complex* f, finufftf_opts* opts) - + 1D complex nonuniform FFT of type 1 (nonuniform to uniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + M-1 f[k1] = SUM c[j] exp(+/-i k1 x(j)) for -N1/2 <= k1 <= (N1-1)/2 - j=0 - + j=0 + Inputs: ntr how many transforms (only for vectorized "many" functions, else ntr=1) M number of nonuniform point sources @@ -28,38 +28,38 @@ from 1e-1 down to ~ 1e-14 (in double precision) or 1e-6 (in single) N1 number of output Fourier modes to be computed opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: f Fourier mode coefficients (size N1*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. * Fourier frequency indices in each dimension i are the integers lying in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings. - - + + :: - - int finufft1d2(int64_t M, double* x, complex* c, int iflag, double eps, int64_t + + int finufft1d2(int64_t M, double* x, complex* c, int iflag, double eps, int64_t N1, complex* f, finufft_opts* opts) - int finufftf1d2(int64_t M, float* x, complex* c, int iflag, float eps, int64_t N1, + int finufftf1d2(int64_t M, float* x, complex* c, int iflag, float eps, int64_t N1, complex* f, finufftf_opts* opts) - - int finufft1d2many(int ntr, int64_t M, double* x, complex* c, int iflag, double + + int finufft1d2many(int ntr, int64_t M, double* x, complex* c, int iflag, double eps, int64_t N1, complex* f, finufft_opts* opts) - int finufftf1d2many(int ntr, int64_t M, float* x, complex* c, int iflag, float + int finufftf1d2many(int ntr, int64_t M, float* x, complex* c, int iflag, float eps, int64_t N1, complex* f, finufftf_opts* opts) - + 1D complex nonuniform FFT of type 2 (uniform to nonuniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + c[j] = SUM f[k1] exp(+/-i k1 x[j]) for j = 0,...,M-1 - k1 + k1 where the sum is over integers -N1/2 <= k1 <= (N1-1)/2. - + Inputs: ntr how many transforms (only for vectorized "many" functions, else ntr=1) M number of nonuniform point targets @@ -70,38 +70,38 @@ N1 number of input Fourier modes f Fourier mode coefficients (size N1*ntr complex array) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: c values at nonuniform point targets (size M*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. * Fourier frequency indices in each dimension i are the integers lying in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings. - - + + :: - - int finufft1d3(int64_t M, double* x, complex* c, int iflag, double eps, int64_t + + int finufft1d3(int64_t M, double* x, complex* c, int iflag, double eps, int64_t N, double* s, complex* f, finufft_opts* opts) - int finufftf1d3(int64_t M, float* x, complex* c, int iflag, float eps, int64_t N, + int finufftf1d3(int64_t M, float* x, complex* c, int iflag, float eps, int64_t N, float* s, complex* f, finufftf_opts* opts) - - int finufft1d3many(int ntr, int64_t M, double* x, complex* c, int iflag, double + + int finufft1d3many(int ntr, int64_t M, double* x, complex* c, int iflag, double eps, int64_t N, double* s, complex* f, finufft_opts* opts) - int finufftf1d3many(int ntr, int64_t M, float* x, complex* c, int iflag, float + int finufftf1d3many(int ntr, int64_t M, float* x, complex* c, int iflag, float eps, int64_t N, float* s, complex* f, finufftf_opts* opts) - + 1D complex nonuniform FFT of type 3 (nonuniform to nonuniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + M-1 f[k] = SUM c[j] exp(+-i s[k] x[j]), for k = 0,...,N-1 j=0 - + Inputs: ntr how many transforms (only for vectorized "many" functions, else ntr=1) M number of nonuniform point sources @@ -113,11 +113,11 @@ N number of nonuniform frequency targets s nonuniform frequency targets in R (length N real array) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: f Fourier transform values at targets (size N*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. diff --git a/docs/c1d.docsrc b/docs/c1d.docsrc index e8d563618..c1ba17fe3 100644 --- a/docs/c1d.docsrc +++ b/docs/c1d.docsrc @@ -4,7 +4,7 @@ int @F1d1(int64_t M, double* x, complex* c, int iflag, double eps, int64 @t M-1 f[k1] = SUM c[j] exp(+/-i k1 x(j)) for -N1/2 <= k1 <= (N1-1)/2 - j=0 + j=0 Inputs: @nt @@ -28,7 +28,7 @@ int @F1d2(int64_t M, double* x, complex* c, int iflag, double eps, int64 1D complex nonuniform FFT of type 2 (uniform to nonuniform). @t c[j] = SUM f[k1] exp(+/-i k1 x[j]) for j = 0,...,M-1 - k1 + k1 where the sum is over integers -N1/2 <= k1 <= (N1-1)/2. Inputs: diff --git a/docs/c2d.doc b/docs/c2d.doc index d9275bd95..aa0ee6379 100644 --- a/docs/c2d.doc +++ b/docs/c2d.doc @@ -1,25 +1,25 @@ :: - - int finufft2d1(int64_t M, double* x, double* y, complex* c, int iflag, double + + int finufft2d1(int64_t M, double* x, double* y, complex* c, int iflag, double eps, int64_t N1, int64_t N2, complex* f, finufft_opts* opts) - int finufftf2d1(int64_t M, float* x, float* y, complex* c, int iflag, float eps, + int finufftf2d1(int64_t M, float* x, float* y, complex* c, int iflag, float eps, int64_t N1, int64_t N2, complex* f, finufftf_opts* opts) - - int finufft2d1many(int ntr, int64_t M, double* x, double* y, complex* c, int + + int finufft2d1many(int ntr, int64_t M, double* x, double* y, complex* c, int iflag, double eps, int64_t N1, int64_t N2, complex* f, finufft_opts* opts) - int finufftf2d1many(int ntr, int64_t M, float* x, float* y, complex* c, int iflag, + int finufftf2d1many(int ntr, int64_t M, float* x, float* y, complex* c, int iflag, float eps, int64_t N1, int64_t N2, complex* f, finufftf_opts* opts) - + 2D complex nonuniform FFT of type 1 (nonuniform to uniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + M-1 f[k1,k2] = SUM c[j] exp(+/-i (k1 x[j] + k2 y[j])) j=0 - + for -N1/2 <= k1 <= (N1-1)/2, -N2/2 <= k2 <= (N2-1)/2. - + Inputs: ntr how many transforms (only for vectorized "many" functions, else ntr=1) M number of nonuniform point sources @@ -31,33 +31,33 @@ N1 number of output Fourier modes to be computed (x direction) N2 number of output Fourier modes to be computed (y direction) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: f Fourier mode coefficients (size N1*N2*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. * Fourier frequency indices in each dimension i are the integers lying in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings. - + :: - - int finufft2d2(int64_t M, double* x, double* y, complex* c, int iflag, double + + int finufft2d2(int64_t M, double* x, double* y, complex* c, int iflag, double eps, int64_t N1, int64_t N2, complex* f, finufft_opts* opts) - int finufftf2d2(int64_t M, float* x, float* y, complex* c, int iflag, float eps, + int finufftf2d2(int64_t M, float* x, float* y, complex* c, int iflag, float eps, int64_t N1, int64_t N2, complex* f, finufftf_opts* opts) - - int finufft2d2many(int ntr, int64_t M, double* x, double* y, complex* c, int + + int finufft2d2many(int ntr, int64_t M, double* x, double* y, complex* c, int iflag, double eps, int64_t N1, int64_t N2, complex* f, finufft_opts* opts) - int finufftf2d2many(int ntr, int64_t M, float* x, float* y, complex* c, int iflag, + int finufftf2d2many(int ntr, int64_t M, float* x, float* y, complex* c, int iflag, float eps, int64_t N1, int64_t N2, complex* f, finufftf_opts* opts) - + 2D complex nonuniform FFT of type 2 (uniform to nonuniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + c[j] = SUM f[k1,k2] exp(+/-i (k1 x[j] + k2 y[j])) for j = 0,...,M-1 k1,k2 where the sum is over integers -N1/2 <= k1 <= (N1-1)/2, @@ -73,38 +73,38 @@ N2 number of input Fourier modes (y direction) f Fourier mode coefficients (size N1*N2*ntr complex array) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: c values at nonuniform point targets (size M*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. * Fourier frequency indices in each dimension i are the integers lying in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings. - + :: - - int finufft2d3(int64_t M, double* x, double* y, complex* c, int iflag, double + + int finufft2d3(int64_t M, double* x, double* y, complex* c, int iflag, double eps, int64_t N, double* s, double* t, complex* f, finufft_opts* opts) - int finufftf2d3(int64_t M, float* x, float* y, complex* c, int iflag, float eps, + int finufftf2d3(int64_t M, float* x, float* y, complex* c, int iflag, float eps, int64_t N, float* s, float* t, complex* f, finufftf_opts* opts) - - int finufft2d3many(int ntr, int64_t M, double* x, double* y, complex* c, int - iflag, double eps, int64_t N, double* s, double* t, complex* f, finufft_opts* + + int finufft2d3many(int ntr, int64_t M, double* x, double* y, complex* c, int + iflag, double eps, int64_t N, double* s, double* t, complex* f, finufft_opts* opts) - int finufftf2d3many(int ntr, int64_t M, float* x, float* y, complex* c, int iflag, + int finufftf2d3many(int ntr, int64_t M, float* x, float* y, complex* c, int iflag, float eps, int64_t N, float* s, float* t, complex* f, finufftf_opts* opts) - + 2D complex nonuniform FFT of type 3 (nonuniform to nonuniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + M-1 f[k] = SUM c[j] exp(+-i (s[k] x[j] + t[k] y[j])), for k = 0,...,N-1 j=0 - + Inputs: ntr how many transforms (only for vectorized "many" functions, else ntr=1) M number of nonuniform point sources @@ -116,11 +116,11 @@ N number of nonuniform frequency targets s,t nonuniform frequency target coordinates in R^2 (length N real arrays) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: f Fourier transform values at targets (size N*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. diff --git a/docs/c3d.doc b/docs/c3d.doc index d333d8190..9d6e50877 100644 --- a/docs/c3d.doc +++ b/docs/c3d.doc @@ -1,27 +1,27 @@ :: - - int finufft3d1(int64_t M, double* x, double* y, double* z, complex* c, int iflag, + + int finufft3d1(int64_t M, double* x, double* y, double* z, complex* c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufft_opts* opts) - int finufftf3d1(int64_t M, float* x, float* y, float* z, complex* c, int iflag, + int finufftf3d1(int64_t M, float* x, float* y, float* z, complex* c, int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufftf_opts* opts) - - int finufft3d1many(int ntr, int64_t M, double* x, double* y, double* z, complex* - c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex* f, + + int finufft3d1many(int ntr, int64_t M, double* x, double* y, double* z, complex* + c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufft_opts* opts) - int finufftf3d1many(int ntr, int64_t M, float* x, float* y, float* z, complex* c, - int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex* f, + int finufftf3d1many(int ntr, int64_t M, float* x, float* y, float* z, complex* c, + int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufftf_opts* opts) - + 3D complex nonuniform FFT of type 1 (nonuniform to uniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + M-1 f[k1,k2] = SUM c[j] exp(+/-i (k1 x[j] + k2 y[j] + k3 z[j])) j=0 - + for -N1/2 <= k1 <= (N1-1)/2, -N2/2 <= k2 <= (N2-1)/2, -N3/2 <= k3 <= (N3-1)/2 - + Inputs: ntr how many transforms (only for vectorized "many" functions, else ntr=1) M number of nonuniform point sources @@ -34,39 +34,39 @@ N2 number of output Fourier modes to be computed (y direction) N3 number of output Fourier modes to be computed (z direction) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: f Fourier mode coefficients (size N1*N2*N3*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. * Fourier frequency indices in each dimension i are the integers lying in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings. - - + + :: - - int finufft3d2(int64_t M, double* x, double* y, double* z, complex* c, int iflag, + + int finufft3d2(int64_t M, double* x, double* y, double* z, complex* c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufft_opts* opts) - int finufftf3d2(int64_t M, float* x, float* y, float* z, complex* c, int iflag, + int finufftf3d2(int64_t M, float* x, float* y, float* z, complex* c, int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufftf_opts* opts) - - int finufft3d2many(int ntr, int64_t M, double* x, double* y, double* z, complex* - c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex* f, + + int finufft3d2many(int ntr, int64_t M, double* x, double* y, double* z, complex* + c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufft_opts* opts) - int finufftf3d2many(int ntr, int64_t M, float* x, float* y, float* z, complex* c, - int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex* f, + int finufftf3d2many(int ntr, int64_t M, float* x, float* y, float* z, complex* c, + int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex* f, finufftf_opts* opts) - + 3D complex nonuniform FFT of type 2 (uniform to nonuniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + c[j] = SUM f[k1,k2,k3] exp(+/-i (k1 x[j] + k2 y[j] + k3 z[j])) k1,k2,k3 - + for j = 0,...,M-1, where the sum is over integers -N1/2 <= k1 <= (N1-1)/2, -N2/2 <= k2 <= (N2-1)/2, @@ -83,38 +83,38 @@ N3 number of input Fourier modes (z direction) f Fourier mode coefficients (size N1*N2*N3*ntr complex array) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: c values at nonuniform point targets (size M*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * complex arrays interleave Re, Im values, and their size is stated with dimensions ordered fastest to slowest. * Fourier frequency indices in each dimension i are the integers lying in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings. - - + + :: - - int finufft3d3(int64_t M, double* x, double* y, double* z, complex* c, int iflag, - double eps, int64_t N, double* s, double* t, double* u, complex* f, finufft_opts* + + int finufft3d3(int64_t M, double* x, double* y, double* z, complex* c, int iflag, + double eps, int64_t N, double* s, double* t, double* u, complex* f, finufft_opts* opts) - int finufftf3d3(int64_t M, float* x, float* y, float* z, complex* c, int iflag, - float eps, int64_t N, float* s, float* t, float* u, complex* f, finufftf_opts* + int finufftf3d3(int64_t M, float* x, float* y, float* z, complex* c, int iflag, + float eps, int64_t N, float* s, float* t, float* u, complex* f, finufftf_opts* opts) - - int finufft3d3many(int ntr, int64_t M, double* x, double* y, double* z, complex* - c, int iflag, double eps, int64_t N, double* s, double* t, double* u, complex* f, + + int finufft3d3many(int ntr, int64_t M, double* x, double* y, double* z, complex* + c, int iflag, double eps, int64_t N, double* s, double* t, double* u, complex* f, finufft_opts* opts) - int finufftf3d3many(int ntr, int64_t M, float* x, float* y, float* z, complex* c, - int iflag, float eps, int64_t N, float* s, float* t, float* u, complex* f, + int finufftf3d3many(int ntr, int64_t M, float* x, float* y, float* z, complex* c, + int iflag, float eps, int64_t N, float* s, float* t, float* u, complex* f, finufftf_opts* opts) - + 3D complex nonuniform FFT of type 3 (nonuniform to nonuniform). - + Computes to precision eps, via a fast algorithm, one or more transforms of the form: - + M-1 f[k] = SUM c[j] exp(+-i (s[k] x[j] + t[k] y[j] + u[k] z[j])), j=0 @@ -130,7 +130,11 @@ N number of nonuniform frequency targets s,t,u nonuniform frequency target coordinates in R^3 (length N real arrays) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: f Fourier transform values at targets (size N*ntr complex array) return value 0: success, 1: success but warning, >1: error (see error.rst) + + Notes: + * complex arrays interleave Re, Im values, and their size is stated with + dimensions ordered fastest to slowest. diff --git a/docs/c3d.docsrc b/docs/c3d.docsrc index 006f90722..f15fc74d7 100644 --- a/docs/c3d.docsrc +++ b/docs/c3d.docsrc @@ -79,4 +79,4 @@ int @F3d3(int64_t M, double* x, double* y, double* z, complex* c, int if Outputs: f Fourier transform values at targets (size N*ntr complex array) @r -@no \ No newline at end of file +@no diff --git a/docs/cguru.doc b/docs/cguru.doc index f21db430b..16c2f8485 100644 --- a/docs/cguru.doc +++ b/docs/cguru.doc @@ -1,16 +1,16 @@ :: - - int finufft_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, double eps, + + int finufft_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, double eps, finufft_plan* plan, finufft_opts* opts) - int finufftf_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, float eps, + int finufftf_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, float eps, finufftf_plan* plan, finufftf_opts* opts) - + Make a plan to perform one or more general transforms. - + Under the hood, for type 1 and 2, this does FFTW planning and kernel Fourier transform precomputation. For type 3, this does very little, since the FFT sizes are not yet known. - + Inputs: type type of transform (1,2, or 3) dim spatial dimension (1,2, or 3) @@ -22,36 +22,36 @@ eps desired relative precision; smaller is slower. This can be chosen from 1e-1 down to ~ 1e-14 (in double precision) or 1e-6 (in single) opts pointer to options struct (see opts.rst), or NULL for defaults - + Outputs: plan plan object (under the hood this is a pointer to another struct) return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * All available threads are planned by default (but see opts.nthreads) * The vectorized (many vector) plan, ie ntrans>1, can be much faster than repeated calls with the same nonuniform points. Note that here the I/O data ordering is stacked rather than interleaved. * For more details about the fields in the opts pointer, see opts.rst - - + + :: - - int finufft_setpts(finufft_plan plan, int64_t M, double* x, double* y, double* z, int64_t + + int finufft_setpts(finufft_plan plan, int64_t M, double* x, double* y, double* z, int64_t N, double* s, double* t, double* z) - int finufftf_setpts(finufftf_plan plan, int64_t M, float* x, float* y, float* z, int64_t + int finufftf_setpts(finufftf_plan plan, int64_t M, float* x, float* y, float* z, int64_t N, float* s, float* t, float* z) - + Input nonuniform points with coordinates x (and possibly y, and possibly z), and, if type 3, nonuniform frequency target coordinates s (and possibly t, and possibly u), into an existing plan. If type is 1 or 2 then the last four arguments are ignored. Unused dimensions are ignored. - + Under the hood, for type 1 or 2, this routine bin-sorts the points (storing just the permutation rather than new copies of the coordinates). For type 3 it also bin-sorts the frequencies, chooses two levels of grid sizes, then plans the inner type 2 call (interpolation and FFTW). - + Inputs: M number of nonuniform spatial points (used by all types) x nonuniform point x-coordinates (length M real array) @@ -66,13 +66,13 @@ ignored otherwise u if dim>2, nonuniform frequency z-coordinates (length N real array), ignored otherwise - + Input/Outputs: plan plan object - + Outputs: return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * The coordinates in x (and if nonempty, y and z) can be any real numbers. For type 1 and 2 transforms, their definitions imply that that the @@ -87,22 +87,22 @@ sizes is not too large (it controls the internal fine grid size). * The coordinates pointed to by any used arrays x, y, z, s, t, u must not be changed between this call and the below execute call! - - + + :: - + int finufft_execute(finufft_plan plan, complex* c, complex* f) int finufftf_execute(finufftf_plan plan, complex* c, complex* f) - + Perform one or more NUFFT transforms using previously entered nonuniform points and an existing plan. To summarize, this maps type 1: c -> f - type 2: f -> c + type 2: f -> c type 3: c -> f - + Inputs: plan plan object - + Input/Outputs: c For types 1 and 3, the input strengths at the nonuniform point sources (size M*ntr complex array). @@ -116,28 +116,28 @@ respectively). If type 3, the output values at the nonuniform frequency targets (size N*ntr complex array). - + Outputs: return value 0: success, 1: success but warning, >1: error (see error.rst) - + Notes: * The contents of the arrays x, y, z, s, t, u must not have changed since the finufft_setpts call that read them. The execution rereads them (this way of doing business saves RAM). * f and c are contiguous Fortran-style arrays with the transform number, if ntr>1, being the "slowest" (outer) dimension. - - + + :: - + int finufft_destroy(finufft_plan plan) int finufftf_destroy(finufftf_plan plan) - + Deallocate a plan object. This must be used upon clean-up, or before reusing a plan in another call to finufft_makeplan. - + Inputs/Outputs: plan plan object - + Outputs: return value 0: success, 1: success but warning, >1: error (see error.rst) diff --git a/docs/cguru.docsrc b/docs/cguru.docsrc index 49ab62df3..7747aab7f 100644 --- a/docs/cguru.docsrc +++ b/docs/cguru.docsrc @@ -83,7 +83,7 @@ int @G_execute(finufft_plan plan, complex* c, complex* f) Perform one or more NUFFT transforms using previously entered nonuniform points and an existing plan. To summarize, this maps type 1: c -> f - type 2: f -> c + type 2: f -> c type 3: c -> f Inputs: @@ -102,7 +102,7 @@ int @G_execute(finufft_plan plan, complex* c, complex* f) respectively). If type 3, the output values at the nonuniform frequency targets (size N*ntr complex array). - + Outputs: @r diff --git a/docs/install_gpu.rst b/docs/install_gpu.rst index 360543ac1..ec6a74b60 100644 --- a/docs/install_gpu.rst +++ b/docs/install_gpu.rst @@ -39,12 +39,12 @@ In order to configure cuFINUFFT for a specific compute capability, use the ``CMA To find out your own device's compute capability without having to look it up on the web, use: .. code-block:: bash - + nvidia-smi --query-gpu=compute_cap --format=csv,noheader This will return a text string such as ``8.6`` which would incidate ``sm_86`` architecture, thus to use ``CMAKE_CUDA_ARCHITECTURES=86``. - + Testing ------- diff --git a/docs/latexindex.rst b/docs/latexindex.rst index b98e67544..87ff7ebeb 100644 --- a/docs/latexindex.rst +++ b/docs/latexindex.rst @@ -8,7 +8,7 @@ :orphan: - + Flatiron Institute Nonuniform Fast Fourier Transform ===================================================== @@ -16,23 +16,23 @@ Flatiron Institute Nonuniform Fast Fourier Transform numbering. Note that ch.1 is now the overview (unlike in index.rst): .. toctree:: - + overview install dirs math - cex + cex c opts error trouble tut - fortran + fortran matlab python julia changelog - devnotes + devnotes related users ackn diff --git a/docs/makefile.doc b/docs/makefile.doc index 75684bde0..711eddae6 100644 --- a/docs/makefile.doc +++ b/docs/makefile.doc @@ -1,4 +1,5 @@ -Makefile for FINUFFT library. Please specify your task: +make[1]: Entering directory '/home/marco/repos/finufft' +Makefile for FINUFFT CPU library. Please specify your task: make lib - build the main library (in lib/ and lib-static/) make examples - compile and run all codes in examples/ make test - compile and run quick math validation tests @@ -22,3 +23,4 @@ Make options: You must at least 'make objclean' before changing such options! Also see docs/install.rst and docs/README +make[1]: Leaving directory '/home/marco/repos/finufft' diff --git a/docs/math.rst b/docs/math.rst index 89c3356ba..c535c985c 100644 --- a/docs/math.rst +++ b/docs/math.rst @@ -15,7 +15,7 @@ Type 1 and type 2 involve the Fourier "modes" (Fourier series coefficients) with integer frequency indices lying in the Cartesian product set .. math:: - + K = K_{N_1,\dots,N_d} := K_{N_1} \times K_{N_2} \times \dots \times K_{N_d}~, where @@ -40,21 +40,21 @@ Then the **type 1** (nonuniform to uniform, aka "adjoint") NUFFT evaluates .. math:: :label: 1 - + f_\mathbf{k} := \sum_{j=1}^M c_j e^{\pm i \mathbf{k}\cdot \mathbf{x}_j} \qquad \mbox{for } \mathbf{k} \in K - + This can be viewed as evaluating a set of Fourier series coefficients due to sources -with strengths $c_j$ at the arbitrary locations $\mathbf{x}_j$. +with strengths $c_j$ at the arbitrary locations $\mathbf{x}_j$. Either sign of the imaginary unit in the exponential can be chosen in the interface. Note that our normalization differs from that of references [DR,GL]. The **type 2** (U to NU, aka "forward") NUFFT evaluates .. math:: :label: 2 - + c_j := \sum_{\mathbf{k}\in K} f_\mathbf{k} e^{\pm i \mathbf{k}\cdot \mathbf{x}_j} \qquad \mbox{for } j=1,\ldots,M @@ -83,7 +83,7 @@ Then the type 3 transform evaluates: .. math:: :label: 3 - + f_k := \sum_{j=1}^M c_j e^{\pm i \mathbf{s}_k\cdot \mathbf{x}_j} \qquad \mbox{for } k=1,\ldots,N diff --git a/docs/matlab.rst b/docs/matlab.rst index ecd05f141..efede7970 100644 --- a/docs/matlab.rst +++ b/docs/matlab.rst @@ -32,7 +32,7 @@ their defaults, for instance: .. code-block:: matlab - o.modeord = 1; % choose FFT-style output mode ordering + o.modeord = 1; % choose FFT-style output mode ordering f = finufft1d1(x,c,+1,1e-12,N,o); % do it The above usage we call the "simple" interface. There is also a "vectorized" @@ -75,7 +75,7 @@ Here we use the guru interface to repeat the first demo above: delete(plan); % don't forget to clean up .. warning:: - + If an existing array is passed to ``setpts``, then this array must not be altered before ``execute`` is called! This is because, in order to save RAM (allowing larger problems to be solved), internally FINUFFT stores only *pointers* to ``x`` (etc), rather than unnecessarily duplicating this data. This is not true if an *expression* such as ``-x`` or ``2*pi*rand(M,1)`` is passed to ``setpts``, since in those cases the ``plan`` object does make internal copies, as per MATLAB's usual shallow-copy argument passing. Finally, we demo a 2D type 1 transform using the simple interface. Let's @@ -124,5 +124,5 @@ MATLAB path via something like ``addpath FINUFFT/matlab``, then .. literalinclude:: ../matlab/Contents.m The individual commands have the following help documentation: - + .. include:: matlabhelp.doc diff --git a/docs/nfft_migr.rst b/docs/nfft_migr.rst index 114807b1a..f7616447a 100644 --- a/docs/nfft_migr.rst +++ b/docs/nfft_migr.rst @@ -18,7 +18,7 @@ Also of interest (but not yet demonstrated below) is: * the forward NFFT transform (a.k.a. type 2) * the nonuniform to nonuniform NNFFT (a.k.a. type 3) - + .. note:: The NFFT3 library can do more things---real-valued data, sphere, rotation group, hyperbolic cross, inverse transforms---none of which FINUFFT can yet do directly (although our three transforms can be used as components in such tasks). We do not address those here. Migrating a 2D adjoint transform (type 1) in C from NFFT3 to FINUFFT diff --git a/docs/refs.rst b/docs/refs.rst index 11663596e..38eb4b37f 100644 --- a/docs/refs.rst +++ b/docs/refs.rst @@ -83,4 +83,3 @@ These may be a useful introduction to FINUFFT and applications. Yu-Hsuan (Melody) Shih's PDSEC2021 20-minute presentation video on cuFINUFFT is here: https://www.youtube.com/watch?v=PnW6ehMyHxM - diff --git a/docs/tut.rst b/docs/tut.rst index 57af9f67d..ed329707e 100644 --- a/docs/tut.rst +++ b/docs/tut.rst @@ -27,8 +27,7 @@ For further applications, see :ref:`references `, and: * `Fast Fresnel diffraction `_ for optics and acoustics applications. * `Equispaced Fourier methods for Gaussian process regression `_ as described in https://arxiv.org/abs/2210.10210 and https://arxiv.org/abs/2305.11065 - + * Tutorials for PyNUFFT with 1D and 2D reconstruction examples `here `_. - -* The numerical sampling of `random plane waves `_. +* The numerical sampling of `random plane waves `_. diff --git a/docs/tutorial/contft.rst b/docs/tutorial/contft.rst index 5574420f5..f00b2468b 100644 --- a/docs/tutorial/contft.rst +++ b/docs/tutorial/contft.rst @@ -72,7 +72,7 @@ Notice :eq:`fint` is simply a type 3 NUFFT with strengths $c_j = f(x_j) w_j$, so we evaluate it by calling FINUFFT (this takes 0.1 sec) then plot the resulting FT at its target $k$ points: .. code-block:: matlab - + tol = 1e-10; fhat = finufft1d3(xj, f(xj).*wj, +1, tol, k); plot(k, [real(fhat),imag(fhat)], '.'); @@ -152,7 +152,7 @@ you will have to shift (by it to this specific offset, then post-multiply ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The above $f$ was merely discontinuous. But you can now go further and easily -replace $(x_j,w_j)$ by a rule that is accurate for +replace $(x_j,w_j)$ by a rule that is accurate for a function with known singularities. Eg, say $f(x) = x^{-1/2}g(x)$ where $g$ is smooth on $[0,1]$, then the change of variable $x=y^2$ means that $\int_0^1 f(x) dx = \int_0^1 2y f(y) dy$, the latter having a @@ -182,7 +182,7 @@ This exhibits rapid convergence kinking in at a slightly higher $N$, while $\hat now has even slower decay (which one can check is $|k|^{-1/2}$): .. code-block:: matlab - + 180 0.208975054515039 200 3.04233050928417e-05 220 1.9202016281569e-10 @@ -221,7 +221,7 @@ Since $g$ is smooth, this will have spectral convergence in $n$ and $m$. Here is a fresh code to make this quadrature over $\Omega$: .. code-block:: matlab - + g = @(t) 1 + 0.3*cos(3*t); % boundary shape n = 280; % # theta nodes t = 2*pi*(1:n)/n; wt = (2*pi/n); % theta nodes, const weights @@ -235,7 +235,7 @@ Here is a fresh code to make this quadrature over $\Omega$: wj(jj) = wt*r^2*xr.*wr; % theta weight times rule for r.dr on (0,r) end plot([bx bx(1)],[by by(1)],'-'); hold on; plot(xj,yj,'.'); - + .. image:: ../pics/contft2dnodes.png :width: 50% @@ -257,7 +257,7 @@ become the source strengths. We also image the result on a log scale: .. image:: ../pics/contft2dans.png :width: 70% - + Thus we have computed the 2D FT of a discontinous function on a million-point grid to around 10-digit accuracy in 0.05 sec (the FINUFFT transform time). Note that, as with 1D discontinuous functions, the decay with $k:=|{\bf k}|$ is slow (it is like $1/k$). See the full code `tutorial/contft2d.m `_ also for the study that shows that, for the above ``kmax``, convergence to the tolerance has occurred by ``n=280`` and ``m=70``, needing $N=19600$ nodes. A more efficient set would vary ``m`` with $\theta$. @@ -271,7 +271,7 @@ See the full code `tutorial/contft2d.m `_. diff --git a/docs/tutorial/serieseval.rst b/docs/tutorial/serieseval.rst index 038ca3194..ad8924fe3 100644 --- a/docs/tutorial/serieseval.rst +++ b/docs/tutorial/serieseval.rst @@ -53,7 +53,7 @@ Compare this to a naive calculation (which serves to remind us exactly what sum norm(c-cn,inf) :: - + Elapsed time is 11.679265 seconds. ans = 1.76508266507874e-11 @@ -69,10 +69,10 @@ multithreaded summation. (Naive summation with reversed loop order is even worse .. image:: ../pics/fser1d.png :width: 90% - + See the full code `tutorial/serieseval1d.m `_ which also shows how to evaluate the same series on a uniform grid via the plain FFT. - + 2D Fourier series ~~~~~~~~~~~~~~~~~ @@ -104,7 +104,7 @@ target points you like: :: Elapsed time is 0.092743 seconds. - + 1 million modes to 1 million points in 92 milliseconds on a laptop is decent. We check the math (using a relative error measure) at just one (generic) point: @@ -113,12 +113,12 @@ We check the math (using a relative error measure) at just one (generic) point: j = 1; % do math check on 1st target... c1 = sum(sum(fk.*exp(1i*(k1*x(j)+k2*y(j))))); abs(c1-c(j)) / norm(c,inf) - + :: - + ans = 2.30520830208365e-10 - + Finally we use a colored scatter plot to show the first $10\%$ of the points in the square, and see samples of the underlying random field (reminiscent of WMAP microwave background data): .. code-block:: matlab @@ -126,9 +126,8 @@ Finally we use a colored scatter plot to show the first $10\%$ of the points in jplot = 1:1e5; % indices to plot scatter(x(jplot),y(jplot),1.0,real(c(jplot)),'filled'); axis tight equal xlabel('x'); ylabel('y'); colorbar; title('Re f(x,y)'); - + .. image:: ../pics/fser2d.png :width: 70% - -See the full code `tutorial/serieseval2d.m `_. +See the full code `tutorial/serieseval2d.m `_. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index eeda37050..a0fa360bf 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -5,39 +5,40 @@ set(EXAMPLES many1d1 simple1d1 simple1d1f - simulplans1d1) + simulplans1d1 +) set(EXAMPLES_OPENMP threadsafe1d1 threadsafe2d2f) set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf) find_library(MATH_LIBRARY m) foreach(EXAMPLE ${EXAMPLES}) - add_executable(${EXAMPLE} ${EXAMPLE}.cpp) - target_compile_features(${EXAMPLE} PRIVATE cxx_std_14) - target_link_libraries(${EXAMPLE} PRIVATE finufft) - if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT") - enable_asan(${EXAMPLE}) - endif() + add_executable(${EXAMPLE} ${EXAMPLE}.cpp) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_14) + target_link_libraries(${EXAMPLE} PRIVATE finufft) + if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT") + enable_asan(${EXAMPLE}) + endif() endforeach() foreach(EXAMPLE ${EXAMPLES_C}) - add_executable(${EXAMPLE} ${EXAMPLE}.c) - target_link_libraries(${EXAMPLE} PRIVATE finufft) - if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT") - enable_asan(${EXAMPLE}) - endif() - if(MATH_LIBRARY) - target_link_libraries(${EXAMPLE} PRIVATE ${MATH_LIBRARY}) - endif() + add_executable(${EXAMPLE} ${EXAMPLE}.c) + target_link_libraries(${EXAMPLE} PRIVATE finufft) + if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT") + enable_asan(${EXAMPLE}) + endif() + if(MATH_LIBRARY) + target_link_libraries(${EXAMPLE} PRIVATE ${MATH_LIBRARY}) + endif() endforeach() if(FINUFFT_USE_OPENMP) - foreach(EXAMPLE ${EXAMPLES_OPENMP}) - add_executable(${EXAMPLE} ${EXAMPLE}.cpp) - target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX) - target_compile_features(${EXAMPLE} PRIVATE cxx_std_11) - if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT") - enable_asan(${EXAMPLE}) - endif() - endforeach() + foreach(EXAMPLE ${EXAMPLES_OPENMP}) + add_executable(${EXAMPLE} ${EXAMPLE}.cpp) + target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_11) + if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT") + enable_asan(${EXAMPLE}) + endif() + endforeach() endif() diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index b9742a865..d08d606b1 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -1,10 +1,10 @@ file(GLOB example_src "*.cpp") foreach(srcfile ${example_src}) - string(REPLACE ".cpp" "" executable ${srcfile}) - get_filename_component(executable ${executable} NAME) - add_executable(${executable} ${srcfile}) - target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - target_link_libraries(${executable} cufinufft) - target_compile_features(${executable} PRIVATE cxx_std_17) + string(REPLACE ".cpp" "" executable ${srcfile}) + get_filename_component(executable ${executable} NAME) + add_executable(${executable} ${srcfile}) + target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) + target_link_libraries(${executable} cufinufft) + target_compile_features(${executable} PRIVATE cxx_std_17) endforeach() diff --git a/fortran/CMakeLists.txt b/fortran/CMakeLists.txt index b02396cfe..64d035964 100644 --- a/fortran/CMakeLists.txt +++ b/fortran/CMakeLists.txt @@ -1,16 +1,27 @@ add_library( - directft OBJECT directft/dirft1d.f directft/dirft1df.f directft/dirft2d.f - directft/dirft2df.f directft/dirft3d.f directft/dirft3df.f) + directft + OBJECT + directft/dirft1d.f + directft/dirft1df.f + directft/dirft2d.f + directft/dirft2df.f + directft/dirft3d.f + directft/dirft3df.f +) -set(FORTRAN_EXAMPLES guru1d1 nufft1d_demo nufft2d_demo nufft2dmany_demo - nufft3d_demo simple1d1) +set(FORTRAN_EXAMPLES + guru1d1 + nufft1d_demo + nufft2d_demo + nufft2dmany_demo + nufft3d_demo + simple1d1 +) foreach(EXAMPLE ${FORTRAN_EXAMPLES}) - add_executable(fort_${EXAMPLE} examples/${EXAMPLE}.f) - add_executable(fort_${EXAMPLE}f examples/${EXAMPLE}f.f) + add_executable(fort_${EXAMPLE} examples/${EXAMPLE}.f) + add_executable(fort_${EXAMPLE}f examples/${EXAMPLE}f.f) - target_link_libraries(fort_${EXAMPLE} PRIVATE directft finufft - ${FINUFFT_FFTLIBS}) - target_link_libraries(fort_${EXAMPLE}f PRIVATE directft finufft - ${FINUFFT_FFTLIBS}) + target_link_libraries(fort_${EXAMPLE} PRIVATE directft finufft ${FINUFFT_FFTLIBS}) + target_link_libraries(fort_${EXAMPLE}f PRIVATE directft finufft ${FINUFFT_FFTLIBS}) endforeach() diff --git a/fortran/cmcl_license.txt b/fortran/cmcl_license.txt index cd94a0ae4..0504ecbb9 100644 --- a/fortran/cmcl_license.txt +++ b/fortran/cmcl_license.txt @@ -8,13 +8,13 @@ Copyright (c) 2009-2014, Leslie Greengard, June-Yub Lee and Zydrunas Gimbutas All rights reserved. Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: +modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. + list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. + and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED @@ -28,5 +28,5 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, +of the authors and should not be interpreted as representing official policies, either expressed or implied, of the FreeBSD Project. diff --git a/fortran/directft/README b/fortran/directft/README index 008f329df..bde21642f 100644 --- a/fortran/directft/README +++ b/fortran/directft/README @@ -1,3 +1,3 @@ This directory contains the CMCL NUFFT direct summation implementations, plus single-precision versions by Alex Barnett, 2017. -It also contains the legendary prini.f, which is currently unused. \ No newline at end of file +It also contains the legendary prini.f, which is currently unused. diff --git a/fortran/directft/dirft1d.f b/fortran/directft/dirft1d.f index 4e36286c1..ff964550f 100644 --- a/fortran/directft/dirft1d.f +++ b/fortran/directft/dirft1d.f @@ -1,8 +1,8 @@ -cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee +cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee cc Contact: greengard@cims.nyu.edu cc cc This software is being released under a FreeBSD license -cc (see license.txt in this directory). +cc (see license.txt in this directory). c*********************************************************************** subroutine dirft1d1(nj,xj,cj, iflag, ms,fk) implicit none @@ -13,7 +13,7 @@ subroutine dirft1d1(nj,xj,cj, iflag, ms,fk) c direct computation of nonuniform FFT c c nj -c fk(k1) = SUM cj(j) exp(+/-i k1 xj(j)) +c fk(k1) = SUM cj(j) exp(+/-i k1 xj(j)) c j=1 c c for -ms/2 <= k1 <= (ms-1)/2 @@ -64,8 +64,8 @@ subroutine dirft1d2(nj,xj,cj, iflag, ms,fk) c ---------------------------------------------------------------------- c direct computation of nonuniform FFT c -c cj(j) = SUM fk(k1) exp(+/-i k1 xj(j)) -c k1 +c cj(j) = SUM fk(k1) exp(+/-i k1 xj(j)) +c k1 c for j = 1,...,nj c c where -ms/2 <= k1 <= (ms-1)/2 @@ -109,8 +109,8 @@ subroutine dirft1d3(nj,xj,cj, iflag, nk,sk,fk) c direct computation of nonuniform FFT c c nj -c fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) -c j=1 +c fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) +c j=1 c c for k = 1, ..., nk c diff --git a/fortran/directft/dirft1df.f b/fortran/directft/dirft1df.f index 4756670d6..cf0edd527 100644 --- a/fortran/directft/dirft1df.f +++ b/fortran/directft/dirft1df.f @@ -1,4 +1,4 @@ -cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee +cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee cc Contact: greengard@cims.nyu.edu cc cc This software is being released under a FreeBSD license @@ -14,7 +14,7 @@ subroutine dirft1d1f(nj,xj,cj, iflag, ms,fk) c direct computation of nonuniform FFT c c nj -c fk(k1) = SUM cj(j) exp(+/-i k1 xj(j)) +c fk(k1) = SUM cj(j) exp(+/-i k1 xj(j)) c j=1 c c for -ms/2 <= k1 <= (ms-1)/2 @@ -65,8 +65,8 @@ subroutine dirft1d2f(nj,xj,cj, iflag, ms,fk) c ---------------------------------------------------------------------- c direct computation of nonuniform FFT c -c cj(j) = SUM fk(k1) exp(+/-i k1 xj(j)) -c k1 +c cj(j) = SUM fk(k1) exp(+/-i k1 xj(j)) +c k1 c for j = 1,...,nj c c where -ms/2 <= k1 <= (ms-1)/2 @@ -110,8 +110,8 @@ subroutine dirft1d3f(nj,xj,cj, iflag, nk,sk,fk) c direct computation of nonuniform FFT c c nj -c fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) -c j=1 +c fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) +c j=1 c c for k = 1, ..., nk c diff --git a/fortran/directft/dirft2d.f b/fortran/directft/dirft2d.f index eecae6392..ad763f8d9 100644 --- a/fortran/directft/dirft2d.f +++ b/fortran/directft/dirft2d.f @@ -1,8 +1,8 @@ -cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee +cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee cc Contact: greengard@cims.nyu.edu -cc +cc cc This software is being released under a FreeBSD license -cc (see license.txt in this directory). +cc (see license.txt in this directory). cc subroutine dirft2d1(nj,xj,yj,cj, iflag, ms,mt,fk) implicit none @@ -175,7 +175,7 @@ subroutine dirft2d3(nj,xj,yj,cj, iflag, nk,sk,tk,fk) c c nj c fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) exp(+/-i t(k) yj(j)) -c j=1 +c j=1 c c for k = 1, ..., nk c diff --git a/fortran/directft/dirft2df.f b/fortran/directft/dirft2df.f index 8ea5278d4..fe8205f57 100644 --- a/fortran/directft/dirft2df.f +++ b/fortran/directft/dirft2df.f @@ -1,9 +1,9 @@ -cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee +cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee cc Contact: greengard@cims.nyu.edu -cc +cc cc This software is being release%d under a FreeBSD license cc (see license.txt in this directory). -cc Single-prec version Barnett 4/5/17 +cc Single-prec version Barnett 4/5/17 cc subroutine dirft2d1f(nj,xj,yj,cj, iflag, ms,mt,fk) implicit none @@ -176,7 +176,7 @@ subroutine dirft2d3f(nj,xj,yj,cj, iflag, nk,sk,tk,fk) c c nj c fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) exp(+/-i t(k) yj(j)) -c j=1 +c j=1 c c for k = 1, ..., nk c diff --git a/fortran/directft/dirft3d.f b/fortran/directft/dirft3d.f index 6afbc76b5..bea4ba3b2 100644 --- a/fortran/directft/dirft3d.f +++ b/fortran/directft/dirft3d.f @@ -1,8 +1,8 @@ -cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee +cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee cc Contact: greengard@cims.nyu.edu -cc +cc cc This software is being released under a FreeBSD license -cc (see license.txt in this directory). +cc (see license.txt in this directory). cc ************************************************************************ subroutine dirft3d1(nj,xj,yj,zj,cj,iflag,ms,mt,mu,fk) @@ -18,7 +18,7 @@ subroutine dirft3d1(nj,xj,yj,zj,cj,iflag,ms,mt,mu,fk) c j=1 exp(+/-i k2 yj(j)) * c exp(+/-i k3 zj(j)) c -c for -ms/2 <= k1 <= (ms-1)/2, +c for -ms/2 <= k1 <= (ms-1)/2, c -mt/2 <= k2 <= (mt-1)/2 c -mu/2 <= k3 <= (mu-1)/2 c @@ -120,13 +120,13 @@ subroutine dirft3d2(nj,xj,yj,zj,cj, iflag, ms,mt,mu,fk) c direct computation of nonuniform FFT c c -c cj(j) = SUM SUM SUM fk(k1,k2,k3) exp(+/-i k1 xj(j)) * +c cj(j) = SUM SUM SUM fk(k1,k2,k3) exp(+/-i k1 xj(j)) * c k1 k2 k3 exp(+/-i k2 yj(j)) c exp(+/-i k3 zj(j)) c c for j = 1,...,nj c -c where -ms/2 <= k1 <= (ms-1)/2 +c where -ms/2 <= k1 <= (ms-1)/2 c -mt/2 <= k2 <= (mt-1)/2 c -mu/2 <= k3 <= (mu-1)/2 c @@ -274,4 +274,3 @@ subroutine dirft3d3(nj,xj,yj,zj,cj, iflag, nk,sk,tk,uk,fk) enddo enddo end - diff --git a/fortran/directft/dirft3df.f b/fortran/directft/dirft3df.f index 05e788ce0..094f96a74 100644 --- a/fortran/directft/dirft3df.f +++ b/fortran/directft/dirft3df.f @@ -1,8 +1,8 @@ -cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee +cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee cc Contact: greengard@cims.nyu.edu -cc +cc cc This software is being released under a FreeBSD license -cc (see license.txt in this directory). +cc (see license.txt in this directory). cc Single-prec version Barnett 4/5/17 cc ************************************************************************ @@ -19,7 +19,7 @@ subroutine dirft3d1f(nj,xj,yj,zj,cj,iflag,ms,mt,mu,fk) c j=1 exp(+/-i k2 yj(j)) * c exp(+/-i k3 zj(j)) c -c for -ms/2 <= k1 <= (ms-1)/2, +c for -ms/2 <= k1 <= (ms-1)/2, c -mt/2 <= k2 <= (mt-1)/2 c -mu/2 <= k3 <= (mu-1)/2 c @@ -121,13 +121,13 @@ subroutine dirft3d2f(nj,xj,yj,zj,cj, iflag, ms,mt,mu,fk) c direct computation of nonuniform FFT c c -c cj(j) = SUM SUM SUM fk(k1,k2,k3) exp(+/-i k1 xj(j)) * +c cj(j) = SUM SUM SUM fk(k1,k2,k3) exp(+/-i k1 xj(j)) * c k1 k2 k3 exp(+/-i k2 yj(j)) c exp(+/-i k3 zj(j)) c c for j = 1,...,nj c -c where -ms/2 <= k1 <= (ms-1)/2 +c where -ms/2 <= k1 <= (ms-1)/2 c -mt/2 <= k2 <= (mt-1)/2 c -mu/2 <= k3 <= (mu-1)/2 c @@ -275,4 +275,3 @@ subroutine dirft3d3f(nj,xj,yj,zj,cj, iflag, nk,sk,tk,uk,fk) enddo enddo end - diff --git a/fortran/directft/prini.f b/fortran/directft/prini.f index 31a2536b6..95a838f90 100644 --- a/fortran/directft/prini.f +++ b/fortran/directft/prini.f @@ -132,7 +132,7 @@ subroutine msgmerge(a,b,c) do 1200 i=1,1000 c if(a(i) .eq. ast) goto 1400 - c(i)=a(i) + c(i)=a(i) iadd=i 1200 continue c diff --git a/fortran/examples/guru1d1.f b/fortran/examples/guru1d1.f index 22aed424a..4898001a7 100755 --- a/fortran/examples/guru1d1.f +++ b/fortran/examples/guru1d1.f @@ -4,7 +4,7 @@ c Legacy-style: f77, plus dynamic allocation & derived types from f90. c To compile (linux/GCC) from this directory, use eg (paste to one line): - + c gfortran -fopenmp -I../../include -I/usr/include guru1d1.f c ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ -o guru1d1 @@ -30,15 +30,15 @@ program guru1d1 integer ttype,dim,ntrans c to pass null pointers to unused arguments... real*8, pointer :: dummy => null() - + c this is what you use as the "opaque" ptr to ptr to finufft_plan... integer*8 plan c this is how you create the options struct in fortran... type(finufft_opts) opts c or this is if you want default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() - - + + c how many nonuniform pts M = 1000000 c how many modes (not too much since FFTW_MEASURE slow later) @@ -83,7 +83,7 @@ program guru1d1 endif call finufft_destroy(plan,ier) - + c math test: single output mode with given freq (not array index) k ktest = N/3 fktest = dcmplx(0,0) @@ -133,6 +133,6 @@ program guru1d1 print *,'failed! ier=',ier endif call finufft_destroy(plan,ier) - + stop end diff --git a/fortran/examples/guru1d1f.f b/fortran/examples/guru1d1f.f index 316ad57ec..d95d695ec 100755 --- a/fortran/examples/guru1d1f.f +++ b/fortran/examples/guru1d1f.f @@ -4,7 +4,7 @@ c Legacy-style: f77, plus dynamic allocation & derived types from f90. c To compile (linux/GCC) from this directory, use eg (paste to one line): - + c gfortran-9 -fopenmp -I../../include -I/usr/include guru1d1f.f c -L../../lib -lfinufftf -o guru1d1f @@ -37,7 +37,7 @@ program guru1d1f type(finufft_opts) opts c or this is if you want default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() - + c how many nonuniform pts M = 200000 c how many modes (not too much since FFTW_MEASURE slow later) @@ -82,7 +82,7 @@ program guru1d1f endif call finufftf_destroy(plan,ier) - + c math test: single output mode with given freq (not array index) k ktest = N/3 fktest = cmplx(0,0) @@ -132,6 +132,6 @@ program guru1d1f print *,'failed! ier=',ier endif call finufftf_destroy(plan,ier) - + stop end diff --git a/fortran/examples/nufft1d_demo.f b/fortran/examples/nufft1d_demo.f index e63a7e434..a57cd6997 100755 --- a/fortran/examples/nufft1d_demo.f +++ b/fortran/examples/nufft1d_demo.f @@ -16,7 +16,7 @@ c program nufft1d_demo implicit none - + c our fortran-header, always needed include 'finufft.fh' c diff --git a/fortran/examples/nufft2d_demo.f b/fortran/examples/nufft2d_demo.f index b37cfeffc..b07c3ea4a 100755 --- a/fortran/examples/nufft2d_demo.f +++ b/fortran/examples/nufft2d_demo.f @@ -16,7 +16,7 @@ c program nufft2d_demo implicit none - + c our fortran-header, always needed include 'finufft.fh' c @@ -113,7 +113,7 @@ program nufft2d_demo print *, ' ier = ',ier call errcomp(fk0,fk1,nk,err) print *, ' type 3 error = ',err - enddo + enddo stop end c diff --git a/fortran/examples/nufft2d_demof.f b/fortran/examples/nufft2d_demof.f index b649f6109..3500b441e 100755 --- a/fortran/examples/nufft2d_demof.f +++ b/fortran/examples/nufft2d_demof.f @@ -16,7 +16,7 @@ c program nufft2d_demof implicit none - + c our fortran-header, always needed include 'finufft.fh' c @@ -108,7 +108,7 @@ program nufft2d_demof print *, ' ier = ',ier call errcomp(fk0,fk1,nk,err) print *, ' type 3 error = ',err - enddo + enddo stop end c diff --git a/fortran/examples/nufft2dmany_demo.f b/fortran/examples/nufft2dmany_demo.f index 605237161..f8a4be981 100755 --- a/fortran/examples/nufft2dmany_demo.f +++ b/fortran/examples/nufft2dmany_demo.f @@ -7,7 +7,7 @@ c c Vectorized (many data vectors) demo type 1,2 by Melody Shih, 2018, c type 3 by Alex Barnett, 2020. Based on nufft2d_demo.f. -c Also see: ../README. +c Also see: ../README. c c Compile with, eg (GCC, multithreaded; paste to a single line): c @@ -29,7 +29,7 @@ program nufft2dmany_demo c for default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() -c +c c -------------------------------------------------- c create some test data c -------------------------------------------------- @@ -91,7 +91,7 @@ program nufft2dmany_demo c call 2D Type 1 method c ----------------------- c - call finufft2d1many(ntrans,nj,xj,yj,cj,iflag, + call finufft2d1many(ntrans,nj,xj,yj,cj,iflag, & eps,ms,mt,fk1,defopts,ier) do d = 1, ntrans call dirft2d1(nj,xj,yj,cj(1+(d-1)*nj:d*nj),iflag,ms,mt, @@ -134,7 +134,7 @@ program nufft2dmany_demo maxerr = max(maxerr,err) enddo print *, ' max type 3 error = ',err - enddo + enddo stop end c diff --git a/fortran/examples/nufft2dmany_demof.f b/fortran/examples/nufft2dmany_demof.f index d0bf80115..ac5f72abc 100755 --- a/fortran/examples/nufft2dmany_demof.f +++ b/fortran/examples/nufft2dmany_demof.f @@ -7,7 +7,7 @@ c c Vectorized (many data vectors) demo type 1,2 by Melody Shih, 2018, c type 3 & single-prec by Alex Barnett, 2020. Based on nufft2d_demo.f. -c Also see: ../README. +c Also see: ../README. c c Compile with, eg (GCC, multithreaded; paste to a single line): c @@ -17,7 +17,7 @@ c Note: you must link to single-precision build of FINUFFT program nufft2dmany_demof implicit none - + c our fortran-header, always needed include 'finufft.fh' c @@ -86,7 +86,7 @@ program nufft2dmany_demof c call 2D Type 1 method c ----------------------- c - call finufftf2d1many(ntrans,nj,xj,yj,cj,iflag, + call finufftf2d1many(ntrans,nj,xj,yj,cj,iflag, & eps,ms,mt,fk1,defopts,ier) do d = 1, ntrans call dirft2d1f(nj,xj,yj,cj(1+(d-1)*nj:d*nj),iflag,ms,mt, @@ -129,7 +129,7 @@ program nufft2dmany_demof maxerr = max(maxerr,err) enddo print *, ' max type 3 error = ',err - enddo + enddo stop end c diff --git a/fortran/examples/nufft3d_demo.f b/fortran/examples/nufft3d_demo.f index af04afa5f..e8166d6cb 100755 --- a/fortran/examples/nufft3d_demo.f +++ b/fortran/examples/nufft3d_demo.f @@ -28,7 +28,7 @@ program nufft3d_demo complex*16, allocatable :: cj(:),cj0(:),cj1(:),fk0(:),fk1(:) c for default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() - + c c -------------------------------------------------- c create some test data @@ -122,7 +122,7 @@ program nufft3d_demo print *, ' ier = ',ier call errcomp(fk0,fk1,nk,err) print *, ' type 3 error = ',err - enddo + enddo stop end c diff --git a/fortran/examples/nufft3d_demof.f b/fortran/examples/nufft3d_demof.f index 2e5a9e21f..89cd687b1 100755 --- a/fortran/examples/nufft3d_demof.f +++ b/fortran/examples/nufft3d_demof.f @@ -16,7 +16,7 @@ c program nufft3d_demof implicit none - + c our fortran-header, always needed include 'finufft.fh' @@ -29,7 +29,7 @@ program nufft3d_demof c for default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() -c +c c -------------------------------------------------- c create some test data c -------------------------------------------------- @@ -117,7 +117,7 @@ program nufft3d_demof print *, ' ier = ',ier call errcomp(fk0,fk1,nk,err) print *, ' type 3 error = ',err - enddo + enddo stop end c diff --git a/fortran/examples/simple1d1.f b/fortran/examples/simple1d1.f index 534f9b255..98ba187d8 100755 --- a/fortran/examples/simple1d1.f +++ b/fortran/examples/simple1d1.f @@ -4,7 +4,7 @@ c Legacy-style: f77, plus dynamic allocation & derived types from f90. c To compile (linux/GCC) from this directory, use eg (paste to one line): - + c gfortran -fopenmp -I../../include simple1d1.f -o simple1d1 c ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ @@ -12,7 +12,7 @@ program simple1d1 implicit none - + c our fortran-header, always needed include 'finufft.fh' @@ -29,7 +29,7 @@ program simple1d1 type(finufft_opts) opts c or this is if you want default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() - + c how many nonuniform pts M = 2000000 c how many modes @@ -77,7 +77,7 @@ program simple1d1 ktestindex = ktest + N/2 + 1 print '("rel err for mode k=",i10," is ",e10.2)',ktest, $ cdabs(fk(ktestindex)-fktest)/fmax - + c do another transform, but now first setting some options... print *,'' print *, 'setting new options, rerun simple interface...' @@ -96,6 +96,6 @@ program simple1d1 else print *,'failed! ier=',ier endif - + stop end diff --git a/fortran/examples/simple1d1.f90 b/fortran/examples/simple1d1.f90 index f173cc6df..f0fb3378a 100755 --- a/fortran/examples/simple1d1.f90 +++ b/fortran/examples/simple1d1.f90 @@ -5,18 +5,18 @@ ! To compile (linux/GCC) from this directory, note the module also has to be ! compiled, eg: - + ! gfortran -fopenmp ../../include/finufft_mod.f90 simple1d1.f90 -o simple1d1_f90 ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ ! Alex Barnett, to demo Reinhard Neder f90 module, 1/20/23. program simple1d1 - + ! use the module (which happens to live in ../../include) use finufft_mod - + implicit none - + ! note some inputs are int (int*4) but others BIGINT (int*8) integer ier,iflag integer*8 N,ktest,M,j,k,ktestindex,t1,t2,crate @@ -25,12 +25,12 @@ program simple1d1 parameter (pi=3.141592653589793238462643383279502884197d0) complex*16, allocatable :: cj(:),fk(:) complex*16 fktest - + ! this is how you create the options struct in fortran... type(finufft_opts) opts ! or this is if you want default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() - + ! how many nonuniform pts M = 2000000 ! how many modes @@ -46,7 +46,7 @@ program simple1d1 xj(j) = pi * dcos(pi*j/M) cj(j) = dcmplx( dsin((100d0*j)/M), dcos(1.0+(50d0*j)/M)) enddo - + call system_clock(t1) ! mandatory parameters to FINUFFT: sign of +-i in NUFFT iflag = 1 @@ -62,7 +62,7 @@ program simple1d1 else print *,'failed! ier=',ier endif - + ! math test: single output mode with given freq (not array index) k ktest = N/3 fktest = dcmplx(0,0) @@ -78,7 +78,7 @@ program simple1d1 ktestindex = ktest + N/2 + 1 print '("rel err for mode k=",i10," is ",e10.2)',ktest, & cdabs(fk(ktestindex)-fktest)/fmax - + ! do another transform, but now first setting some options... print *,'' print *, 'setting new options, rerun simple interface...' @@ -97,6 +97,6 @@ program simple1d1 else print *,'failed! ier=',ier endif - + stop end program simple1d1 diff --git a/fortran/examples/simple1d1f.f b/fortran/examples/simple1d1f.f index 3e2c8106a..f11d01abc 100755 --- a/fortran/examples/simple1d1f.f +++ b/fortran/examples/simple1d1f.f @@ -4,7 +4,7 @@ c Legacy-style: f77, plus dynamic allocation & derived types from f90. c To compile (linux/GCC) from this directory, use eg (paste to one line): - + c gfortran -fopenmp -I../../include simple1d1f.f -o simple1d1f c -L../../lib -lfinufftf @@ -12,7 +12,7 @@ program simple1d1f implicit none - + c our fortran header, always needed... include 'finufft.fh' @@ -29,7 +29,7 @@ program simple1d1f type(finufft_opts) opts c or this is if you want default opts, make a null pointer... type(finufft_opts), pointer :: defopts => null() - + c how many nonuniform pts M = 200000 c how many modes @@ -77,7 +77,7 @@ program simple1d1f ktestindex = ktest + N/2 + 1 print '("rel err for mode k=",i10," is ",e10.2)',ktest, $ cabs(fk(ktestindex)-fktest)/fmax - + c do another transform, but now first setting some options... print *,'' print *, 'setting new options, rerun simple interface...' @@ -97,6 +97,6 @@ program simple1d1f else print *,'failed! ier=',ier endif - + stop end diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h index c3a31bd2b..7aff136df 100644 --- a/include/cufinufft/contrib/helper_cuda.h +++ b/include/cufinufft/contrib/helper_cuda.h @@ -134,7 +134,7 @@ static const char *cufftGetErrorString(cufftResult error) { } template -int check(T result, char const *const func, const char *const file, int const line) { +int check(T result, const char *const func, const char *const file, const int line) { if (result) { fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), _cudaGetErrorEnum(result), func); diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 1be81c1df..26228704a 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -611,23 +611,23 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->dim > 0) { const auto ig1 = T(1) / d_plan->type3_params.gam1; const auto C1 = -d_plan->type3_params.C1; - thrust::transform( - thrust::cuda::par.on(stream), d_kx, d_kx + M, d_plan->kx, - [ig1, C1] __host__ __device__(const T x) -> T { return (x + C1) * ig1; }); + thrust::transform(thrust::cuda::par.on(stream), d_kx, d_kx + M, d_plan->kx, + [ig1, C1] __host__ + __device__(const T x) -> T { return (x + C1) * ig1; }); } if (d_plan->dim > 1) { const auto ig2 = T(1) / d_plan->type3_params.gam2; const auto C2 = -d_plan->type3_params.C2; - thrust::transform( - thrust::cuda::par.on(stream), d_ky, d_ky + M, d_plan->ky, - [ig2, C2] __host__ __device__(const T x) -> T { return (x + C2) * ig2; }); + thrust::transform(thrust::cuda::par.on(stream), d_ky, d_ky + M, d_plan->ky, + [ig2, C2] __host__ + __device__(const T x) -> T { return (x + C2) * ig2; }); } if (d_plan->dim > 2) { const auto ig3 = T(1) / d_plan->type3_params.gam3; const auto C3 = -d_plan->type3_params.C3; - thrust::transform( - thrust::cuda::par.on(stream), d_kz, d_kz + M, d_plan->kz, - [ig3, C3] __host__ __device__(const T x) -> T { return (x + C3) * ig3; }); + thrust::transform(thrust::cuda::par.on(stream), d_kz, d_kz + M, d_plan->kz, + [ig3, C3] __host__ + __device__(const T x) -> T { return (x + C3) * ig3; }); } if (d_plan->type3_params.D1 != 0 || d_plan->type3_params.D2 != 0 || d_plan->type3_params.D3 != 0) { @@ -646,8 +646,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ const auto realsign = d_plan->iflag >= 0 ? T(1) : T(-1); thrust::transform( thrust::cuda::par.on(stream), iterator, iterator + M, d_plan->prephase, - [D1, D2, D3, realsign] __host__ __device__( - const thrust::tuple &tuple) -> cuda_complex { + [D1, D2, D3, realsign] __host__ + __device__(const thrust::tuple &tuple) -> cuda_complex { const auto x = thrust::get<0>(tuple); const auto y = thrust::get<1>(tuple); const auto z = thrust::get<2>(tuple); @@ -667,23 +667,23 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ if (d_plan->dim > 0) { const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1; const auto D1 = -d_plan->type3_params.D1; - thrust::transform( - thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_Sp, - [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); }); + thrust::transform(thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_Sp, + [scale, D1] __host__ + __device__(const T s) -> T { return scale * (s + D1); }); } if (d_plan->dim > 1) { const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2; const auto D2 = -d_plan->type3_params.D2; - thrust::transform( - thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_Tp, - [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); }); + thrust::transform(thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_Tp, + [scale, D2] __host__ + __device__(const T t) -> T { return scale * (t + D2); }); } if (d_plan->dim > 2) { const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3; const auto D3 = -d_plan->type3_params.D3; - thrust::transform( - thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_Up, - [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); + thrust::transform(thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_Up, + [scale, D3] __host__ + __device__(const T u) -> T { return scale * (u + D3); }); } { // here we declare phi_hat1, phi_hat2, and phi_hat3 // and the precomputed data for the fseries kernel @@ -738,16 +738,16 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ dim > 1 ? phi_hat2.begin() : phi_hat1.begin(), // to avoid out of bounds access, use phi_hat1 if dim < 3 dim > 2 ? phi_hat3.begin() : phi_hat1.begin())); - thrust::transform( - thrust::cuda::par.on(stream), phi_hat_iterator, phi_hat_iterator + N, - d_plan->deconv, - [dim] __host__ __device__(const thrust::tuple tuple) -> cuda_complex { - auto phiHat = thrust::get<0>(tuple); - // in case dim < 2 or dim < 3, multiply by 1 - phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1); - phiHat *= (dim > 2) ? thrust::get<2>(tuple) : T(1); - return {T(1) / phiHat, T(0)}; - }); + thrust::transform(thrust::cuda::par.on(stream), phi_hat_iterator, + phi_hat_iterator + N, d_plan->deconv, + [dim] __host__ + __device__(const thrust::tuple tuple) -> cuda_complex { + auto phiHat = thrust::get<0>(tuple); + // in case dim < 2 or dim < 3, multiply by 1 + phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1); + phiHat *= (dim > 2) ? thrust::get<2>(tuple) : T(1); + return {T(1) / phiHat, T(0)}; + }); if (is_c_finite && is_c_nonzero) { const auto c1 = d_plan->type3_params.C1; @@ -764,9 +764,9 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::transform( thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, - [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__( - const thrust::tuple tuple, cuda_complex deconv) - -> cuda_complex { + [c1, c2, c3, d1, d2, d3, realsign] __host__ + __device__(const thrust::tuple tuple, cuda_complex deconv) + -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index aebf4bb0b..809ae8306 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -10,7 +10,7 @@ namespace cufinufft { namespace spreadinterp { template -static __forceinline__ __device__ constexpr T cudaFMA(const T a, const T b, const T c) { +static constexpr __forceinline__ __device__ T cudaFMA(const T a, const T b, const T c) { if constexpr (std::is_same_v) { // fused multiply-add, round to nearest even return __fmaf_rn(a, b, c); diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 5b2fba790..d36613833 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -100,10 +100,10 @@ template struct cufinufft_plan_t { cudaStream_t stream; }; -template constexpr static inline cufftType_t cufft_type(); -template<> constexpr inline cufftType_t cufft_type() { return CUFFT_C2C; } +template static inline constexpr cufftType_t cufft_type(); +template<> inline constexpr cufftType_t cufft_type() { return CUFFT_C2C; } -template<> constexpr inline cufftType_t cufft_type() { return CUFFT_Z2Z; } +template<> inline constexpr cufftType_t cufft_type() { return CUFFT_Z2Z; } static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction) { diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 92bdff37c..a35db66ae 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -7,7 +7,6 @@ file(GLOB FINUFFT_MATLAB_M_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.m) add_custom_command( TARGET finufft_mex POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${FINUFFT_MATLAB_M_SOURCES} - ${CMAKE_CURRENT_BINARY_DIR}/$ - VERBATIM) + COMMAND ${CMAKE_COMMAND} -E copy ${FINUFFT_MATLAB_M_SOURCES} ${CMAKE_CURRENT_BINARY_DIR}/$ + VERBATIM +) diff --git a/matlab/finufft1d1.m b/matlab/finufft1d1.m index c24e85642..0bb2bc739 100644 --- a/matlab/finufft1d1.m +++ b/matlab/finufft1d1.m @@ -43,7 +43,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function f = finufft1d1(x,c,isign,eps,ms,o) if nargin<6, o.dummy=1; end % make a dummy options struct diff --git a/matlab/finufft1d2.m b/matlab/finufft1d2.m index 37f8f47cb..13e904bfc 100644 --- a/matlab/finufft1d2.m +++ b/matlab/finufft1d2.m @@ -42,7 +42,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function c = finufft1d2(x,isign,eps,f,o) if nargin<5, o.dummy=1; end diff --git a/matlab/finufft1d3.m b/matlab/finufft1d3.m index c1586e5ca..3d53bf324 100644 --- a/matlab/finufft1d3.m +++ b/matlab/finufft1d3.m @@ -40,7 +40,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function f = finufft1d3(x,c,isign,eps,s,o) if nargin<6, o.dummy=1; end diff --git a/matlab/finufft2d1.m b/matlab/finufft2d1.m index 76feb4d2d..2b1c874fb 100644 --- a/matlab/finufft2d1.m +++ b/matlab/finufft2d1.m @@ -48,7 +48,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function f = finufft2d1(x,y,c,isign,eps,ms,mt,o) if nargin<8, o.dummy=1; end diff --git a/matlab/finufft2d2.m b/matlab/finufft2d2.m index 3a3d2d768..1a48692c5 100644 --- a/matlab/finufft2d2.m +++ b/matlab/finufft2d2.m @@ -44,7 +44,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function c = finufft2d2(x,y,isign,eps,f,o) if nargin<6, o.dummy=1; end diff --git a/matlab/finufft2d3.m b/matlab/finufft2d3.m index daba6620e..8376e756c 100644 --- a/matlab/finufft2d3.m +++ b/matlab/finufft2d3.m @@ -41,7 +41,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function f = finufft2d3(x,y,c,isign,eps,s,t,o) if nargin<8, o.dummy=1; end diff --git a/matlab/finufft3d1.m b/matlab/finufft3d1.m index 71cf358cb..29505a901 100644 --- a/matlab/finufft3d1.m +++ b/matlab/finufft3d1.m @@ -50,7 +50,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function f = finufft3d1(x,y,z,c,isign,eps,ms,mt,mu,o) if nargin<10, o.dummy=1; end diff --git a/matlab/finufft3d2.m b/matlab/finufft3d2.m index ba1ab61f5..358a39de9 100644 --- a/matlab/finufft3d2.m +++ b/matlab/finufft3d2.m @@ -46,7 +46,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function c = finufft3d2(x,y,z,isign,eps,f,o) if nargin<7, o.dummy=1; end diff --git a/matlab/finufft3d3.m b/matlab/finufft3d3.m index a18129b7f..6e4b9d7b7 100644 --- a/matlab/finufft3d3.m +++ b/matlab/finufft3d3.m @@ -42,7 +42,6 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - function f = finufft3d3(x,y,z,c,isign,eps,s,t,u,o) if nargin<10, o.dummy=1; end @@ -52,4 +51,3 @@ p = finufft_plan(3,3,isign,n_transf,eps,o); p.setpts(x,y,z,s,t,u); f = p.execute(c); - diff --git a/matlab/finufft_plan.docsrc b/matlab/finufft_plan.docsrc index 80934d6f0..b167cb00f 100644 --- a/matlab/finufft_plan.docsrc +++ b/matlab/finufft_plan.docsrc @@ -34,7 +34,7 @@ % Creates a finufft_plan MATLAB object in the guru interface to FINUFFT, of % type 1, 2 or 3, and with given numbers of Fourier modes (unless type 3). % -% Inputs: +% Inputs: % type transform type: 1, 2, or 3 % n_modes_or_dim if type is 1 or 2, the number of Fourier modes in each % dimension: [ms] in 1D, [ms mt] in 2D, or [ms mt mu] in 3D. @@ -138,4 +138,3 @@ OPTS12 % kernel Fourier transforms arrays, etc. % % - diff --git a/matlab/finufft_plan.m b/matlab/finufft_plan.m index 6a841481c..b79c632e4 100644 --- a/matlab/finufft_plan.m +++ b/matlab/finufft_plan.m @@ -150,7 +150,6 @@ % kernel Fourier transforms arrays, etc. % % - classdef finufft_plan < handle properties diff --git a/matlab/notes.docbit b/matlab/notes.docbit index 3a2af5b4c..2d8675344 100644 --- a/matlab/notes.docbit +++ b/matlab/notes.docbit @@ -7,4 +7,3 @@ % * For more details about the opts fields, see ../docs/opts.rst % * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs. % * Full documentation is online at http://finufft.readthedocs.io - diff --git a/matlab/test/check_modeords.m b/matlab/test/check_modeords.m index b9b954003..7282c0711 100644 --- a/matlab/test/check_modeords.m +++ b/matlab/test/check_modeords.m @@ -23,7 +23,7 @@ f1 = finufft3d1(x,y,z,c,isign,eps,N1,N2,N3,o); f1 = fftshift(f1); % handles odd dimension lengths Ns same as we do fprintf('\t modeord=1 vs 0: max error = %g\n',norm(f1(:)-f(:),Inf)) -% it's still a mystery how there can be any nonzero difference here - +% it's still a mystery how there can be any nonzero difference here - % maybe the compiler is reordering the calcs, so roundoff appears, somehow? f = randn(N1,N2,N3)+1i*randn(N1,N2,N3); diff --git a/matlab/test/fig_accuracy.m b/matlab/test/fig_accuracy.m index 9a8ddac09..43ed36116 100644 --- a/matlab/test/fig_accuracy.m +++ b/matlab/test/fig_accuracy.m @@ -30,4 +30,3 @@ axis tight; xlabel('tol'); ylabel('err'); %title(sprintf('1d1: (maxerr)/||c||_1, M=%d, N=%d\n',M,N)); title(sprintf('1d1: ||\tilde f - f||_2/||f||_2, M=%d, N=%d\n',M,N)); - diff --git a/matlab/test/guru_setpts_issue.m b/matlab/test/guru_setpts_issue.m index 47daa87ea..60ef13d77 100644 --- a/matlab/test/guru_setpts_issue.m +++ b/matlab/test/guru_setpts_issue.m @@ -36,7 +36,7 @@ yy = -y; plan.setpts(xx, yy); vals3 = plan.execute(coeffs); - + if ( any(isnan(vals2)) || norm(vals - vals2) > tol ) warning('Something went wrong during run #%i', k); fprintf('norm(vals - vals2) = %g\n', norm(vals - vals2)); diff --git a/matlab/valid_setpts.m b/matlab/valid_setpts.m index 0b0eef28a..3cea5179f 100644 --- a/matlab/valid_setpts.m +++ b/matlab/valid_setpts.m @@ -26,7 +26,7 @@ if ~isvector(t), error('FINUFFT:badTshape','FINUFFT t must be a vector'); end if numel(t)~=nk, error('FINUFFT:badTlen','FINUFFT t must have same length as s'); end end -end +end if dim>2 if ~isvector(z), error('FINUFFT:badZshape','FINUFFT z must be a vector'); end if numel(z)~=nj, error('FINUFFT:badZlen','FINUFFT z must have same length as x'); end @@ -34,4 +34,4 @@ if ~isvector(u), error('FINUFFT:badUshape','FINUFFT u must be a vector'); end if numel(u)~=nk, error('FINUFFT:badUlen','FINUFFT u must have same length as s'); end end -end +end diff --git a/perftest/CMakeLists.txt b/perftest/CMakeLists.txt index f0215a6df..e031cc807 100644 --- a/perftest/CMakeLists.txt +++ b/perftest/CMakeLists.txt @@ -2,26 +2,26 @@ set(PERFTESTS guru_timing_test manysmallprobs spreadtestnd spreadtestndall) foreach(TEST ${PERFTESTS}) - add_executable(${TEST} ${TEST}.cpp) - if(FINUFFT_USE_DUCC0) - target_compile_definitions(${TEST} PRIVATE -DFINUFFT_USE_DUCC0) - endif() - finufft_link_test(${TEST}) + add_executable(${TEST} ${TEST}.cpp) + if(FINUFFT_USE_DUCC0) + target_compile_definitions(${TEST} PRIVATE -DFINUFFT_USE_DUCC0) + endif() + finufft_link_test(${TEST}) - add_executable(${TEST}f ${TEST}.cpp) - target_compile_definitions(${TEST}f PRIVATE -DSINGLE) - if(FINUFFT_USE_DUCC0) - target_compile_definitions(${TEST}f PRIVATE -DFINUFFT_USE_DUCC0) - endif() - finufft_link_test(${TEST}f) + add_executable(${TEST}f ${TEST}.cpp) + target_compile_definitions(${TEST}f PRIVATE -DSINGLE) + if(FINUFFT_USE_DUCC0) + target_compile_definitions(${TEST}f PRIVATE -DFINUFFT_USE_DUCC0) + endif() + finufft_link_test(${TEST}f) endforeach() include(CheckIncludeFile) check_include_file("getopt.h" HAVE_GETOPT_H) if(HAVE_GETOPT_H) - add_executable(perftest perftest.cpp) - if(FINUFFT_USE_DUCC0) - target_compile_definitions(perftest PRIVATE -DFINUFFT_USE_DUCC0) - endif() - finufft_link_test(perftest) + add_executable(perftest perftest.cpp) + if(FINUFFT_USE_DUCC0) + target_compile_definitions(perftest PRIVATE -DFINUFFT_USE_DUCC0) + endif() + finufft_link_test(perftest) endif() diff --git a/perftest/checkGuruTiming.sh b/perftest/checkGuruTiming.sh index ba7395bb5..2ac2d1829 100755 --- a/perftest/checkGuruTiming.sh +++ b/perftest/checkGuruTiming.sh @@ -31,10 +31,8 @@ do modeNum2=${modes[index+1]} modeNum3=${modes[index+2]} - echo "./guru_timing_test ${n_trials} ${type} ${dimension} ${modeNum1} ${modeNum2} ${modeNum3} ${srcpts} ${tolerance} ${debug}" + echo "./guru_timing_test ${n_trials} ${type} ${dimension} ${modeNum1} ${modeNum2} ${modeNum3} ${srcpts} ${tolerance} ${debug}" ./guru_timing_test ${n_trials} ${type} ${dimension} ${modeNum1} ${modeNum2} ${modeNum3} ${srcpts} ${tolerance} ${debug} done done done - - diff --git a/perftest/compare_spreads.jl b/perftest/compare_spreads.jl index a2b374821..658801496 100644 --- a/perftest/compare_spreads.jl +++ b/perftest/compare_spreads.jl @@ -15,7 +15,7 @@ function run_spread(repo,dim,M,N,tols,nthr,prec) if prec==Float64 exec = "$repo/perftest/spreadtestnd" elseif prec==Float32 - exec = "$repo/perftest/spreadtestndf" + exec = "$repo/perftest/spreadtestndf" else error("prec not known!") end times = zeros(2,length(tols)) # spread col 1; interp col 2 @@ -82,9 +82,9 @@ if compute ts[:,i,ntolsd+1:end,2] = run_spread(repo2,dim,M,N,tolsf,nthr,Float32) println(ts[:,i,:,:]) end - #tolstr = [[@sprintf "%.0e" tol for tol=tolsd]; [@sprintf "%.0ef" tol for tol=tolsf]] + #tolstr = [[@sprintf "%.0e" tol for tol=tolsd]; [@sprintf "%.0ef" tol for tol=tolsf]] # strings for w (nspread) for plotting... - wstr = [[@sprintf "%d" -log10(tol)+1 for tol=tolsd]; [@sprintf "%df" -log10(tol)+1 for tol=tolsf]] + wstr = [[@sprintf "%d" -log10(tol)+1 for tol=tolsd]; [@sprintf "%df" -log10(tol)+1 for tol=tolsf]] jldsave("$(fnam).jld2"; fnam,ts,wstr,dims,M,N,tolsd,tolsf,nthr) # save all plot_all(fnam,ts,wstr,dims,M,N,nthr) else diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 92c870cc5..fdf04e723 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -2,11 +2,12 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) target_compile_features(cuperftest PRIVATE cxx_std_17) -target_compile_options(cuperftest - PRIVATE $<$:--extended-lambda>) +target_compile_options(cuperftest PRIVATE $<$:--extended-lambda>) set_target_properties( - cuperftest - PROPERTIES LINKER_LANGUAGE CUDA - CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON) + cuperftest + PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON +) diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index f72ffb3e6..777fd48ab 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -302,10 +302,10 @@ int main(int argc, char *argv[]) { std::cout << "Valid options:\n" " --prec \n" " float or double precision. i.e. 'f' or 'd'\n" - " default: " << default_opts.prec << "\n" << + " default: " << default_opts.prec << "\n" << " --type \n" " type of transform. 1 or 2\n" - " default: " << default_opts.type << "\n" << + " default: " << default_opts.type << "\n" << " --n_runs \n" " number of runs to average performance over\n" " default: " << default_opts.n_runs << "\n" << diff --git a/perftest/mymaxthreads.sh b/perftest/mymaxthreads.sh index 1f970f274..ab0b5fed3 100755 --- a/perftest/mymaxthreads.sh +++ b/perftest/mymaxthreads.sh @@ -15,7 +15,7 @@ case "${unameOut}" in # not sure this correct... echo "$NUMBER_OF_PROCESSORS" ;; - *) + *) echo "I'm in an unknown or unsupported operating system: ${unameOut}" >&2 ;; esac diff --git a/perftest/mynumcores.sh b/perftest/mynumcores.sh index 962fa6d3f..780b37681 100755 --- a/perftest/mynumcores.sh +++ b/perftest/mynumcores.sh @@ -20,7 +20,7 @@ case "${unameOut}" in # not sure this is correct... echo "$NUMBER_OF_PROCESSORS" ;; - *) + *) echo "I'm in an unknown or unsupported operating system: ${unameOut}" >&2 ;; esac diff --git a/perftest/results/gcc_vs_icc_xeon.txt b/perftest/results/gcc_vs_icc_xeon.txt index 212436671..ae3ebae8d 100644 --- a/perftest/results/gcc_vs_icc_xeon.txt +++ b/perftest/results/gcc_vs_icc_xeon.txt @@ -8,7 +8,7 @@ Timings in sec: single-thread: t1 t2 - + GCC ICC GCC ICC 1d 2.7 1.7 3.3 2.7 @@ -22,4 +22,3 @@ single-thread: 3d .82 .62 .76 .58 Note: sorting a bit slower under ICC, but spreading is faster. - diff --git a/perftest/searchForTimeMetrics.py b/perftest/searchForTimeMetrics.py index c7e450b70..983b38809 100644 --- a/perftest/searchForTimeMetrics.py +++ b/perftest/searchForTimeMetrics.py @@ -3,7 +3,7 @@ decimalMatchString = "\d+\.?\d+" #regular expression to match a decimal number sciNotString = "(\d*.?\d*e-\d* s)" #regular expression to match a number in scientific notation -wholeNumberMatchString = "\d+" +wholeNumberMatchString = "\d+" #search string needs to have two groupings! (one for everything besides) (time s) @@ -25,12 +25,10 @@ def sumAllTime(searchString, strOut): lineMatch = re.findall(searchString,strOut) for match in lineMatch: val = re.search(sciNotString, match[1]) - if(not val): #search failed, try decimal format + if(not val): #search failed, try decimal format val = re.search(decimalMatchString, match[1]) if(not val): val = re.search(wholeNumberMatchString, match[1]) newVal = newVal + float(val.group(0).split('s')[0].strip()) #trim off " s" newVal = round(newVal,5) return newVal - - diff --git a/perftest/spreadbenchmark.py b/perftest/spreadbenchmark.py index 88f8ff30d..d9ed1328e 100644 --- a/perftest/spreadbenchmark.py +++ b/perftest/spreadbenchmark.py @@ -51,7 +51,7 @@ def runCommand(cmd): status, output = commands.getstatusoutput(cmd) print output return output - + # Test runner def runTests(): print "Running tests..." @@ -70,7 +70,7 @@ def runTests(): interp_speed = max(interp_speed, mi.group("speed")) interp_err = max(interp_err, mi.group("err")) spread_speed = max(spread_speed, ms.group("speed")) - spread_err = max(spread_err, ms.group("err")) + spread_err = max(spread_err, ms.group("err")) results.append({"cmd":cmd, "interp_speed":interp_speed, "interp_err":interp_err, @@ -90,7 +90,7 @@ def checkoutandmake(commit): runCommand("git checkout %s" % commit) print "Making..." print commands.getoutput("make clean test/spreadtestnd " + makeflags) - + def restore(commit, home): if commit == "local": # We just tested local, so do nothing @@ -98,16 +98,16 @@ def restore(commit, home): else: # Return home and pop stash print "Checking out %s and popping stash..." % home - runCommand("git checkout %s" % home) + runCommand("git checkout %s" % home) runCommand("git stash pop") - -# Run tests -print "* Testing %s" % commit_A + +# Run tests +print "* Testing %s" % commit_A checkoutandmake(commit_A) res_A = runTests() restore(commit_A, home) -print "* Testing %s" % commit_B +print "* Testing %s" % commit_B checkoutandmake(commit_B) res_B = runTests() restore(commit_B, home) @@ -132,10 +132,10 @@ def restore(commit, home): spread_speedup = float(res_B[idx]["spread_speed"]) / float(res_A[idx]["spread_speed"])*100 - 100 spread_sign = "+" if spread_speedup > 0 else "" - spread_speedup = spread_sign + "%.1f" % spread_speedup + "%" - interp_speedup = float(res_B[idx]["interp_speed"]) / float(res_A[idx]["interp_speed"])*100 - 100 + spread_speedup = spread_sign + "%.1f" % spread_speedup + "%" + interp_speedup = float(res_B[idx]["interp_speed"]) / float(res_A[idx]["interp_speed"])*100 - 100 interp_sign = "+" if interp_speedup > 0 else "" - interp_speedup = interp_sign + "%.1f" % interp_speedup + "%" + interp_speedup = interp_sign + "%.1f" % interp_speedup + "%" print format % ("", spread_speedup, interp_speedup, "", "") print "" diff --git a/perftest/spreadingSchemeStats.py b/perftest/spreadingSchemeStats.py index fce834f92..c6e9e754c 100644 --- a/perftest/spreadingSchemeStats.py +++ b/perftest/spreadingSchemeStats.py @@ -11,19 +11,19 @@ n_trials = 1 #Small problem, sequential Multithreading -cmdStringOne_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 0" +cmdStringOne_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 0" cmdStringOne_desc = "Small Problem, sequential Multithreaded" #small problem, simulataneous single threaded, nested at end -cmdStringTwo_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 1" +cmdStringTwo_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 1" cmdStringTwo_desc = "Small Problem, single threaded sort in parallel" #Large problem, sequential Multithreading -cmdStringThree_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 0" +cmdStringThree_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 0" cmdStringThree_desc = "Large Problem, sequential Multithreaded" #Large problem, simulataneous single threaded, nested at end -cmdStringFour_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 1" +cmdStringFour_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 1" cmdStringFour_desc = "Large Problem, single threaded sort in parallel" cmdStrings = [cmdStringOne_string, cmdStringTwo_string, cmdStringThree_string, cmdStringFour_string] @@ -35,22 +35,22 @@ print(cmddesc) print(cmdStrings[i]) out = subprocess.run([cmdString[0], cmdString[1], cmdString[2], cmdString[3],cmdString[4], cmdString[5], - cmdString[6], cmdString[7], cmdString[8], cmdString[8], cmdString[10]], + cmdString[6], cmdString[7], cmdString[8], cmdString[8], cmdString[10]], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - + strOut = out.stdout.decode() #convert bytes to string #print(strOut) #uncomment to see debug output ############################################################################### #Gather Spreading Stats - - #spread (old) / [sort+spread] (new) + + #spread (old) / [sort+spread] (new) newSort = stm.sumAllTime('(.*finufft_setpts.*sort)(.*)', strOut) #if type three, adds together both sorts #collect spreading if any - newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut) + newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut) #collect interp if any - newInterp = stm.extractTime('(.*finufft.*exec.*interp)(.*)',strOut) + newInterp = stm.extractTime('(.*finufft.*exec.*interp)(.*)',strOut) #collect the spread timings for each trial of old totalOldSpread = stm.sumAllTime('(.*spread.*ier)(.*)', strOut) #gets spread AND unspread (i.e. interpolation) for all trials diff --git a/perftest/spreadtestnd.sh b/perftest/spreadtestnd.sh index ffb36fde8..b12309b4f 100755 --- a/perftest/spreadtestnd.sh +++ b/perftest/spreadtestnd.sh @@ -23,7 +23,7 @@ else PREC=double ST=./spreadtestnd fi - + echo export OMP_NUM_THREADS=$TESTTHREADS echo "$PREC-precision $OMP_NUM_THREADS-thread tests: #NU = $M, #U = $N, tol = $TOL..." diff --git a/perftest/timingBreakdowns.py b/perftest/timingBreakdowns.py index 3abf92e17..efcdad214 100644 --- a/perftest/timingBreakdowns.py +++ b/perftest/timingBreakdowns.py @@ -11,7 +11,7 @@ ''' A script to run a seris of finufftGuru_tests with varying parameters Captures the stdout, parses it for timing statistics, and graphs speedup -ratio trends. +ratio trends. ''' @@ -26,7 +26,7 @@ modes = [1e6,1,1,1e3,1e3,1,1e2,1e2,1e2] dimensions = [1,2,3] -types = [1,2,3] +types = [1,2,3] #n_trials=[1,10,100] n_trials = [1,10] @@ -84,20 +84,20 @@ for dim in dimensions: for ftype in types: for trial in n_trials: - - print( "./finufftGuru_test "+ str(trial)+ " " + str(ftype)+ " " +str(dim)+ " " + - str(modes[3*(dim-1)])+ " " + str(modes[3*(dim-1)+1])+ " " + str(modes[3*(dim-1)+2])+ " " + - str(M_srcpts)+ " " + str(tolerance) + " " + str(debug)); + + print( "./finufftGuru_test "+ str(trial)+ " " + str(ftype)+ " " +str(dim)+ " " + + str(modes[3*(dim-1)])+ " " + str(modes[3*(dim-1)+1])+ " " + str(modes[3*(dim-1)+2])+ " " + + str(M_srcpts)+ " " + str(tolerance) + " " + str(debug)); #execute the test for this set of parameters - out = subprocess.run(["./finufftGuru_test", str(trial), str(ftype), str(dim), - str(modes[3*(dim-1)]),str(modes[3*(dim-1)+1]), str(modes[3*(dim-1)+2]), - str(M_srcpts), str(tolerance), str(debug)], + out = subprocess.run(["./finufftGuru_test", str(trial), str(ftype), str(dim), + str(modes[3*(dim-1)]),str(modes[3*(dim-1)+1]), str(modes[3*(dim-1)+2]), + str(M_srcpts), str(tolerance), str(debug)], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) strOut = out.stdout.decode() #convert bytes to string print(strOut) - + #parse the output and syphon into data arrays ############################################################################### @@ -133,12 +133,12 @@ totalTimeT3Ratio.append(totalSpeedup) ############################################################################### - - #spread (old) / [sort+spread] (new) + + #spread (old) / [sort+spread] (new) newSort = stm.sumAllTime('(.*finufft_setpts.*sort)(.*)', strOut) #collect spreading if any - newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut) + newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut) #collect interp if any newInterp = stm.extractTime('(.*finufft.*exec.*interp)(.*)',strOut) @@ -166,7 +166,7 @@ #fftw_plan(old) / fftw_plan(new) planSciNotString = '(\(\d+\)[ \t]+)(\d*.?\d*e-\d* s)' planDecimalMatchString= '(\(\d+\)[ \t]+)(\d*\.?\d* s)' - planWholeNumberMatchString= '(\(\d+\)[ \t]+)(\d+ s)' + planWholeNumberMatchString= '(\(\d+\)[ \t]+)(\d+ s)' #collect new fftw_plan time new_fftwPlan=0 @@ -178,12 +178,12 @@ if(not fftwPlanVal): fftwPlanVal = re.search(wholeNumberMatchString, lineMatch.group(0)) new_fftwPlan = float(fftwPlanVal.group(2).split('s')[0]) #strip off s - new_fftwPlan = round(new_fftwPlan,5) + new_fftwPlan = round(new_fftwPlan,5) #collect the fftw_plan timings for each trial of old isInitial = True initialLookup = 0 - totalOldfftwPlan=0 + totalOldfftwPlan=0 lineMatch = re.findall('(? - $) + $ +) -set(CUFINUFFT_INCLUDE_DIRS - ${CUFINUFFT_INCLUDE_DIRS} - PARENT_SCOPE) +set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) # flush denormals to zero and enable verbose PTXAS output set(FINUFFT_CUDA_FLAGS @@ -39,39 +38,36 @@ set(FINUFFT_CUDA_FLAGS -maxrregcount 64 > - >) + > +) if(FINUFFT_SHARED_LINKING) - add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC} - ${PRECISION_DEPENDENT_SRC}) + add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC}) else() - add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC} - ${PRECISION_DEPENDENT_SRC}) - set_target_properties( - cufinufft PROPERTIES POSITION_INDEPENDENT_CODE - ${FINUFFT_POSITION_INDEPENDENT_CODE}) + add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC}) + set_target_properties(cufinufft PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}) endif() target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) # set target build location -set_target_properties(cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY - "${PROJECT_BINARY_DIR}") +set_target_properties(cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}") set_target_properties( - cufinufft - PROPERTIES CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" - CUDA_SEPARABLE_COMPILATION ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - WINDOWS_EXPORT_ALL_SYMBOLS ON - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") + cufinufft + PROPERTIES + CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + WINDOWS_EXPORT_ALL_SYMBOLS ON + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) target_compile_features(cufinufft PRIVATE cxx_std_17) target_compile_options(cufinufft PRIVATE ${FINUFFT_CUDA_FLAGS}) if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS)) - target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft) + target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft) else() - target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static) + target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static) endif() file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h") -set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER - "${CUFINUFFT_PUBLIC_HEADERS}") +set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}") diff --git a/src/cuda/README b/src/cuda/README index 5b3166d81..dc4189482 100644 --- a/src/cuda/README +++ b/src/cuda/README @@ -18,7 +18,7 @@ the Flatiron Institute, advised by Alex Barnett. This folder contains the main source files of the GPU implementations. - cufinufft.cu - Four main stages of cufinufft API. + Four main stages of cufinufft API. (1) cufinufft_makeplan, (2) cufinufft_setpts, (3) cufinufft_execute, (4) cufinufft_destroy. Also, cufinufft_default_opts may precede stage 1. diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index 20bbb6355..2b46b3908 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -434,13 +434,14 @@ void freegpumemory(cufinufft_plan_t *d_plan) utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Fixes a crash whewre the plan itself is deleted before the stream const auto stream = d_plan->stream; - // Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program (it is d_fk) - if(!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3) + // Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program + // (it is d_fk) + if (!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3) CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); - + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 68979f3e5..71b70ef3c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -34,22 +34,14 @@ if(FINUFFT_USE_DUCC0) endif() target_compile_features(testutils PRIVATE cxx_std_17) finufft_link_test(testutils) -add_test( - NAME run_testutils - COMMAND testutils - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} -) +add_test(NAME run_testutils COMMAND testutils WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) if(NOT FINUFFT_USE_DUCC0) add_executable(fftw_lock_test fftw_lock_test.cpp) target_compile_features(fftw_lock_test PRIVATE cxx_std_17) finufft_link_test(fftw_lock_test) - add_test( - NAME run_fftw_lock_test - COMMAND fftw_lock_test - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) + add_test(NAME run_fftw_lock_test COMMAND fftw_lock_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) endif() # Add ctest definitions that run at both precisions... @@ -58,11 +50,7 @@ function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX) # as in makefile. This prevents them taking a huge time on a, say, 128-core # Rome node. ... but I don't know platform-indep way to do that! Does anyone? - add_test( - NAME run_basic_pass_fail_${PREC} - COMMAND basicpassfail${SUFFIX} - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) + add_test(NAME run_basic_pass_fail_${PREC} COMMAND basicpassfail${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) add_test( NAME run_finufft1d_test_${PREC} @@ -72,52 +60,39 @@ function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX) add_test( NAME run_finufft1dmany_test_${PREC} - COMMAND - finufft1dmany_test${SUFFIX} 3 1e2 2e2 ${REQ_TOL} 0 0 0 2 0.0 - ${CHECK_TOL} + COMMAND finufft1dmany_test${SUFFIX} 3 1e2 2e2 ${REQ_TOL} 0 0 0 2 0.0 ${CHECK_TOL} WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) add_test( NAME run_finufft2d_test_${PREC} - COMMAND - finufft2d_test${SUFFIX} 1e2 1e1 1e3 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL} + COMMAND finufft2d_test${SUFFIX} 1e2 1e1 1e3 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL} WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) add_test( NAME run_finufft2dmany_test_${PREC} - COMMAND - finufft2dmany_test${SUFFIX} 3 1e2 1e1 1e3 ${REQ_TOL} 0 0 0 2 0.0 - ${CHECK_TOL} + COMMAND finufft2dmany_test${SUFFIX} 3 1e2 1e1 1e3 ${REQ_TOL} 0 0 0 2 0.0 ${CHECK_TOL} WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) add_test( NAME run_finufft3d_test_${PREC} - COMMAND - finufft3d_test${SUFFIX} 5 10 20 1e2 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL} + COMMAND finufft3d_test${SUFFIX} 5 10 20 1e2 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL} WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) add_test( NAME run_finufft3dmany_test_${PREC} - COMMAND - finufft3dmany_test${SUFFIX} 2 5 10 20 1e2 ${REQ_TOL} 0 0 0 2 0.0 - ${CHECK_TOL} + COMMAND finufft3dmany_test${SUFFIX} 2 5 10 20 1e2 ${REQ_TOL} 0 0 0 2 0.0 ${CHECK_TOL} WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) - add_test( - NAME run_dumbinputs_${PREC} - COMMAND dumbinputs${SUFFIX} - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) + add_test(NAME run_dumbinputs_${PREC} COMMAND dumbinputs${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) add_test( NAME spreadinterp1d_${PREC} - COMMAND - spreadinterp1d_test${SUFFIX} 1e3 1e3 ${REQ_TOL} 0 2 2.0 ${CHECK_TOL} + COMMAND spreadinterp1d_test${SUFFIX} 1e3 1e3 ${REQ_TOL} 0 2 2.0 ${CHECK_TOL} WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) endfunction() diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 2aac08711..01de47cc4 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -5,10 +5,7 @@ foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - target_compile_options( - ${executable} - PUBLIC $<$:--extended-lambda> - ) + target_compile_options(${executable} PUBLIC $<$:--extended-lambda>) find_library(MathLib m) if(MathLib) target_link_libraries(${executable} PUBLIC cufinufft ${MathLib}) @@ -16,9 +13,7 @@ foreach(srcfile ${test_src}) target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( ${executable} - PROPERTIES - LINKER_LANGUAGE CUDA - CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" + PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" ) message( STATUS @@ -31,214 +26,154 @@ endforeach() function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) add_test( NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft1d_test 0 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft1d_test 0 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft1d_test 1 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft1d_test 1 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft1d1_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft1d_test 2 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft1d_test 2 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft1d_test 0 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft1d_test 0 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft1d_test 1 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft1d_test 1 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft1d_test 0 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft1d_test 0 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft1d_test 1 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft1d_test 1 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d1_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC} ${UPSAMP} + COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d1many_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC} ${UPSAMP} + COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d2many_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC} ${UPSAMP} + COMMAND cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d2many_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC} ${UPSAMP} + COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d3many_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft2dmany_test 1 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC} ${UPSAMP} + COMMAND cufinufft2dmany_test 1 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft2d3many_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC} ${UPSAMP} + COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d1_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) if(${PREC} STREQUAL "float") add_test( NAME cufinufft3d1_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d1_test_block_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d2_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d3_test_SM_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 2 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 2 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) endif() add_test( NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP} ) add_test( NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP} - COMMAND - cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 ${PREC} - ${UPSAMP} + COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 ${PREC} ${UPSAMP} ) endfunction() diff --git a/test/spreadinterp1d_test.cpp b/test/spreadinterp1d_test.cpp index 981ae70c8..4aecc9c88 100644 --- a/test/spreadinterp1d_test.cpp +++ b/test/spreadinterp1d_test.cpp @@ -63,7 +63,7 @@ int main(int argc, char *argv[]) { CPX kersum = 0.0; for (auto Fk : F) kersum += Fk; // one kernel mass - // generate random nonuniform points (x) and complex strengths (c) + // generate random nonuniform points (x) and complex strengths (c) #pragma omp parallel { unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s diff --git a/tutorial/README b/tutorial/README index 89e61c2b1..7b037933a 100644 --- a/tutorial/README +++ b/tutorial/README @@ -4,4 +4,3 @@ So far, this is a collection of MATLAB/Octave codes, which make use of utilities in utils/ See the "Tutorials and application demos" section of the documentation. - diff --git a/tutorial/inv1d2.m b/tutorial/inv1d2.m index 179c1b794..3030f92e3 100644 --- a/tutorial/inv1d2.m +++ b/tutorial/inv1d2.m @@ -39,7 +39,7 @@ [f,flag,relres,iter] = pcg(@(f) applyToep(f,vhat), rhs, 1e-6, N); fprintf('CG-Toep relres %.3g done in %d iters, %.3g s\n', relres,iter,toc) end - + yrecon = finufft1d2(x,+1,tol,f); fprintf('\trel l2 resid of Af=y: %.3g\n', norm(yrecon-y)/norm(y)) fprintf('\trel l2 coeff err: %.3g\n', norm(f-ftrue)/norm(ftrue)) diff --git a/tutorial/serieseval1d.m b/tutorial/serieseval1d.m index 222000d68..7838a3c47 100644 --- a/tutorial/serieseval1d.m +++ b/tutorial/serieseval1d.m @@ -43,5 +43,5 @@ hold on; plot(yi,real(fi),'r.'); legend('NU pts','unif pts'); % eye norm is ok % math check: send the unif pts into NUFFT and compare answers -ci = finufft1d2(yi * (2*pi/L),+1,tol,fk); +ci = finufft1d2(yi * (2*pi/L),+1,tol,fk); fprintf('max error on %d unif test pts; %.3g\n',Mp,norm(fi-ci,inf)) diff --git a/tutorial/utils/lgwt.m b/tutorial/utils/lgwt.m index 6b6040374..0218e957b 100644 --- a/tutorial/utils/lgwt.m +++ b/tutorial/utils/lgwt.m @@ -1,7 +1,7 @@ function [x,w]=lgwt(N,a,b) % lgwt.m % -% This script is for computing definite integrals using Legendre-Gauss +% This script is for computing definite integrals using Legendre-Gauss % Quadrature. Computes the Legendre-Gauss nodes and weights on an interval % [a,b] with truncation order N % @@ -12,29 +12,29 @@ % % Written by Greg von Winckel - 02/25/2004 % -% Copyright (c) 2009, Greg von Winckel +% Copyright (c) 2009, Greg von Winckel % All rights reserved. % -% Redistribution and use in source and binary forms, with or without -% modification, are permitted provided that the following conditions are +% Redistribution and use in source and binary forms, with or without +% modification, are permitted provided that the following conditions are % met: % -% * Redistributions of source code must retain the above copyright -% notice, this list of conditions and the following disclaimer. -% * Redistributions in binary form must reproduce the above copyright -% notice, this list of conditions and the following disclaimer in +% * Redistributions of source code must retain the above copyright +% notice, this list of conditions and the following disclaimer. +% * Redistributions in binary form must reproduce the above copyright +% notice, this list of conditions and the following disclaimer in % the documentation and/or other materials provided with the distribution % -% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE % POSSIBILITY OF SUCH DAMAGE. % N=N-1; @@ -48,18 +48,18 @@ % using the recursion relation and the Newton-Raphson method y0=2; % Iterate until new points are uniformly within epsilon of old points - while max(abs(y-y0))>eps + while max(abs(y-y0))>eps L(:,1)=1; L(:,2)=y; for k=2:N1 L(:,k+1)=( (2*k-1)*y.*L(:,k)-(k-1)*L(:,k-1) )/k; - end - Lp=(N2)*( L(:,N1)-y.*L(:,N2) )./(1-y.^2); + end + Lp=(N2)*( L(:,N1)-y.*L(:,N2) )./(1-y.^2); y0=y; - y=y0-L(:,N2)./Lp; + y=y0-L(:,N2)./Lp; end % Linear map from[-1,1] to [a,b] - x=flipud((a*(1-y)+b*(1+y))/2); + x=flipud((a*(1-y)+b*(1+y))/2); % Compute the weights w=(b-a)./((1-y.^2).*Lp.^2)*(N2/N1)^2; end From 82b50a6091571ca81228b2b402e027928b5eb9fc Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 25 Feb 2025 16:42:23 -0500 Subject: [PATCH 2/5] ignoring --- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 72361d1e3..009c21d34 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,2 +1,3 @@ # Applied clang format to the codebase 884ba427be0c60aa3399d5ea71b0e9e3a7cbf686 +b1e484fb294b2759d3b6b1f68ca0bf5e255b87d1 From b292fb01af57545db97d0c09fd4cccccef6b4b16 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 27 Feb 2025 11:22:21 -0500 Subject: [PATCH 3/5] fixes in cmake for cuda (#640) Removing duplicated FINUFFT_CUDA_ARCHITECTURES & enabling LTO on CPU --- CHANGELOG | 2 ++ CMakeLists.txt | 29 ++++++++++++++++------------- cmake/utils.cmake | 11 +++++++++++ docs/install_gpu.rst | 2 ++ perftest/cuda/CMakeLists.txt | 2 +- src/cuda/CMakeLists.txt | 20 ++++++++++++-------- test/CMakeLists.txt | 1 - test/cuda/CMakeLists.txt | 4 ++-- 8 files changed, 46 insertions(+), 25 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4798ab4bd..da32b1b87 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,8 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). Master, using release name V 2.4.0 (2/11/25) +* Removed FINUFFT_CUDA_ARCHITECTURES flag, as it was unnecessary duplication. +* Enabled LTO for finufft, nvcc support is flaky at the moment. * Added GPU spread interp only test. Added CPU spread interp only test to cmake * Make attributes private in Python Plan classes and allow read-only access to them using properties (Andén #608). diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a9d93d93..8d21cfc3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,6 @@ option(FINUFFT_BUILD_DOCS "Whether to build the FINUFFT documentation" OFF) # if FINUFFT_USE_DUCC0 is ON, the following options are ignored set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") -# if FINUFFT_USE_CUDA is OFF, the following options are ignored -set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") # if FINUFFT_USE_CPU is OFF, the following options are ignored set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying target architecture, defaults to -march=native") # sphinx tag (don't remove): @cmake_opts_end @@ -77,7 +75,8 @@ set(FINUFFT_CXX_FLAGS_DEBUG -ggdb -ggdb3 -Wall - -Wno-sign-compare + -Wextra + -Wpedantic -Wno-unknown-pragmas ) @@ -208,7 +207,12 @@ function(finufft_link_test target) endif() enable_asan(${target}) target_compile_features(${target} PRIVATE cxx_std_17) - set_target_properties(${target} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + set_target_properties( + ${target} + PROPERTIES + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} + ) # disable deprecated warnings for tests if supported if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) target_compile_options(${target} PRIVATE -Wno-deprecated-declarations) @@ -277,16 +281,14 @@ if(FINUFFT_USE_CPU) endif() if(FINUFFT_USE_CUDA) - if(NOT DEFINED FINUFFT_CUDA_ARCHITECTURES) - if(DEFINED CMAKE_CUDA_ARCHITECTURES) - set(FINUFFT_CUDA_ARCHITECTURES "{$CMAKE_CUDA_ARCHITECTURES}") - else() - message( - "FINUFFT WARNING: No CUDA architecture supplied via '-DFINUFFT_CUDA_ARCHITECTURES=...', defaulting to 'native'" - ) - message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") - endif() + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + message( + WARNING + "FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native'" + ) + message(WARNING "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") endif() + set(CMAKE_CUDA_ARCHITECTURES "native") enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) @@ -348,6 +350,7 @@ message(STATUS " FINUFFT_FFTW_SUFFIX: ${FINUFFT_FFTW_SUFFIX}") message(STATUS " FINUFFT_FFTW_LIBRARIES: ${FINUFFT_FFTW_LIBRARIES}") message(STATUS " FINUFFT_ARCH_FLAGS: ${FINUFFT_ARCH_FLAGS}") message(STATUS " FINUFFT_USE_DUCC0: ${FINUFFT_USE_DUCC0}") +message(STATUS " CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") # gersemi: on if(FINUFFT_ENABLE_INSTALL) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index bcbbb5169..b5571605f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -75,3 +75,14 @@ function(copy_dll source_target destination_target) unset(SOURCE_FILE) unset(DESTINATION_FILE) endfunction() + +include(CheckIPOSupported) +check_ipo_supported(RESULT LTO_SUPPORTED OUTPUT LTO_ERROR) + +if(LTO_SUPPORTED) + message(STATUS "LTO is supported and enabled.") + set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION TRUE) +else() + message(WARNING "LTO is not supported: ${LTO_ERROR}") + set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION FALSE) +endif() diff --git a/docs/install_gpu.rst b/docs/install_gpu.rst index ec6a74b60..0c5b4f312 100644 --- a/docs/install_gpu.rst +++ b/docs/install_gpu.rst @@ -45,6 +45,8 @@ To find out your own device's compute capability without having to look it up on This will return a text string such as ``8.6`` which would incidate ``sm_86`` architecture, thus to use ``CMAKE_CUDA_ARCHITECTURES=86``. +Note that by default the ``CMAKE_CUDA_ARCHITECTURES`` flag is set to ``native``, which means that the code will be compiled for the compute capability of the GPU on which the code is being compiled. +This might not be portable so it is recommended to set this flag explicitly when building for multiple systems. A good alternative is ``all-major`` which will compile for all major compute capabilities. Testing ------- diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index fdf04e723..6f440ed35 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -7,7 +7,7 @@ set_target_properties( cuperftest PROPERTIES LINKER_LANGUAGE CUDA - CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON ) diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index eccebb17a..83eba1c96 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -34,18 +34,14 @@ set(FINUFFT_CUDA_FLAGS -fmad=true -restrict --extra-device-vectorization - $<$:-G - -maxrregcount - 64 - > - > + -Xnvlink + --strip-all> ) if(FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC}) else() add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC}) - set_target_properties(cufinufft PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}) endif() target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) # set target build location @@ -54,20 +50,28 @@ set_target_properties(cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_B set_target_properties( cufinufft PROPERTIES - CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CUDA_SEPARABLE_COMPILATION ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON WINDOWS_EXPORT_ALL_SYMBOLS ON ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + INTERPROCEDURAL_OPTIMIZATION + OFF # LTO is not supported for CUDA for now + POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE} ) target_compile_features(cufinufft PRIVATE cxx_std_17) target_compile_options(cufinufft PRIVATE ${FINUFFT_CUDA_FLAGS}) -if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS)) +if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS) OR env{CIBUILDWHEEL}) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft) else() target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static) endif() +# disable deprecated warnings for tests if supported +if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS) + target_compile_options(cufinufft PRIVATE $<$:-Wno-deprecated-declarations>) +endif() + file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h") set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}") diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 71b70ef3c..7f2184fec 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -40,7 +40,6 @@ if(NOT FINUFFT_USE_DUCC0) add_executable(fftw_lock_test fftw_lock_test.cpp) target_compile_features(fftw_lock_test PRIVATE cxx_std_17) finufft_link_test(fftw_lock_test) - add_test(NAME run_fftw_lock_test COMMAND fftw_lock_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) endif() diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 01de47cc4..3e1bada51 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -13,12 +13,12 @@ foreach(srcfile ${test_src}) target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( ${executable} - PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}" + PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" ) message( STATUS "Adding test ${executable}" - " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" + " with CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}" ) endforeach() From 30de07872b31030a08944874fd41818e562f0b65 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 27 Feb 2025 17:55:39 -0500 Subject: [PATCH 4/5] initial upsampfact tweak (#638) Adding two python utils to generate a dataset of performance between upsampfact 2.0 and 1.25 across multiple densities. Using it to populate a heuristic that chooses the best one. In some cases this is a significant speedup. --- CHANGELOG | 2 + devel/analyse_upsamp.py | 65 ++++++ devel/wisdom.py | 199 +++++++++++++++++++ include/finufft/heuristics.hpp | 353 +++++++++++++++++++++++++++++++++ src/finufft_core.cpp | 22 +- 5 files changed, 629 insertions(+), 12 deletions(-) create mode 100644 devel/analyse_upsamp.py create mode 100644 devel/wisdom.py create mode 100644 include/finufft/heuristics.hpp diff --git a/CHANGELOG b/CHANGELOG index da32b1b87..c7112b41f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,8 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). Master, using release name V 2.4.0 (2/11/25) +* Tweaked choice of upsampfact to use a density based heuristic for type 1 and 2 + in the CPU library. This gives a significant speedup for some cases. * Removed FINUFFT_CUDA_ARCHITECTURES flag, as it was unnecessary duplication. * Enabled LTO for finufft, nvcc support is flaky at the moment. * Added GPU spread interp only test. Added CPU spread interp only test to cmake diff --git a/devel/analyse_upsamp.py b/devel/analyse_upsamp.py new file mode 100644 index 000000000..b45bf3236 --- /dev/null +++ b/devel/analyse_upsamp.py @@ -0,0 +1,65 @@ +import pandas as pd +import numpy as np + +# Load the dataset +file_path = "wisdom.csv" # Change this to the actual file path +df = pd.read_csv(file_path) + +# Convert specified columns to numeric, coercing errors to NaN +columns_to_convert = ['Num_pts', 'Time_2.0', 'Time_1.25'] +for col in columns_to_convert: + df[col] = pd.to_numeric(df[col], errors='coerce') + +# Drop rows with NaN values in these columns (optional, but recommended) +df = df.dropna(subset=columns_to_convert) +# scientific notation to Epsilon +df['Epsilon'] = df['Epsilon'].apply(lambda x: f'{x:.1e}') + +# round Time_2.0 and Time_1.25 to 2 decimal places +df['Time_2.0'] = df['Time_2.0'].apply(lambda x: round(x, 2)) +df['Time_1.25'] = df['Time_1.25'].apply(lambda x: round(x, 2)) + + +# Extracting grid dimensions and volume from 'Size' column +def parse_grid_size(size): + try: + size = str(size).strip() + if 'x' in size: + dims = list(map(int, size.split('x'))) + volume = np.prod(dims) + dim = len(dims) + else: + volume = int(size) + dim = 1 + return volume, str(dim) + except ValueError: + return np.nan, np.nan + + +df[['Volume', 'Dim']] = df['Size'].apply(lambda x: pd.Series(parse_grid_size(str(x)))) + +# Compute density +df['Density'] = df['Num_pts'] / df['Volume'] +# remove columns where Time_2.0 is > Time_1.25 +df = df[df['Time_2.0'] <= df['Time_1.25']] + +# drop columns where diff between Time_2.0 and Time_1.25 is less than 5% +df['% Diff'] = abs(df['Time_2.0'] - df['Time_1.25']) / df['Time_1.25'] +df = df[df['% Diff'] >= .05] + +# keep rows where n_threads is 1 +df = df[df['n_threads'] == 16] +# drop time columns +df.drop(columns=['Time_2.0', 'Time_1.25', 'n_threads'], inplace=True) +df.drop(columns=['% Diff', 'Size', 'Num_pts', 'Volume', 'Unnamed: 0'], inplace=True) +# group by NUFFT_type, Data_type, Density, Dim, n_threads to make print more readable, concatenate values do not aggregate them +# Group by specific columns and concatenate values instead of aggregating them +grouped_df = df.groupby(['NUFFT_type', 'Data_type', 'Density', 'Dim'], as_index=False).agg(lambda x: ', '.join(x)) +# grouped_df['Epsilon'] = grouped_df['Epsilon'].apply(lambda x: "{:.1e}".format(float(x))) + +# sort by NUFFT type first, then Dim and then Density +grouped_df = grouped_df.sort_values(by=['NUFFT_type', 'Data_type', 'Dim', 'Density']) +grouped_df = grouped_df[['NUFFT_type', 'Data_type', 'Dim', 'Density', 'Epsilon']] + +# Display the final dataset +print(grouped_df.to_string(index=False)) diff --git a/devel/wisdom.py b/devel/wisdom.py new file mode 100644 index 000000000..9a3baa3ec --- /dev/null +++ b/devel/wisdom.py @@ -0,0 +1,199 @@ +import numpy as np +import finufft +import timeit +import statistics +import matplotlib.pyplot as plt +import os +import pandas as pd +import sys + +sys.stdout.reconfigure(line_buffering=True) # Ensures auto-flushing + + +# Global list for collecting detailed benchmark rows +all_results = [] + +def benchmark_function(func, *args, runs=5, **kwargs): + """Runs the function multiple times and returns average & std dev.""" + runtimes = [ + timeit.timeit(lambda: func(*args, **kwargs), number=1) + for _ in range(runs) + ] + avg_runtime = statistics.mean(runtimes) + stdev_runtime = statistics.stdev(runtimes) if runs > 1 else 0.0 + return avg_runtime, stdev_runtime + +# cache functiojn results with decorator +def generate_random_data(nufft_type, nufft_sizes, num_pts, dtype): + """Generates random NUFFT input data in the correct dtype.""" + dim = len(nufft_sizes) + rng = np.random.Generator(np.random.SFC64(42)) + # Set the correct complex dtype + complex_dtype = np.complex64 if dtype == np.float32 else np.complex128 + + # Generate data for nonuniform points and coefficients + x = [np.array(2 * np.pi * rng.random(num_pts) - np.pi, dtype=dtype) for _ in range(dim)] + c = np.array(rng.random(num_pts) + 1j * rng.random(num_pts), dtype=complex_dtype) + + f = rng.standard_normal(nufft_sizes, dtype=dtype) + 1j * rng.standard_normal(nufft_sizes, dtype=dtype) if nufft_type == 2 else None + d = [np.array(2 * np.pi * rng.random(num_pts) - np.pi, dtype=dtype) for _ in range(dim)] if nufft_type == 3 else None + return x, c, f, d + +def run_nufft(nufft_type, nufft_sizes, epsilon, n_threads, upsampfac, x, c, f, d): + """Runs NUFFT with the correct dtype and parameters.""" + opts = {'nthreads': n_threads, 'upsampfac': upsampfac, 'debug': 0} + dim = len(nufft_sizes) + + if nufft_type == 1: + if dim == 1: + return finufft.nufft1d1(x[0], c, nufft_sizes[0], eps=epsilon, **opts) + elif dim == 2: + return finufft.nufft2d1(x[0], x[1], c, nufft_sizes, eps=epsilon, **opts) + elif dim == 3: + return finufft.nufft3d1(x[0], x[1], x[2], c, nufft_sizes, eps=epsilon, **opts) + elif nufft_type == 2: + if dim == 1: + return finufft.nufft1d2(x[0], f, eps=epsilon, **opts) + elif dim == 2: + return finufft.nufft2d2(x[0], x[1], f, eps=epsilon, **opts) + elif dim == 3: + return finufft.nufft3d2(x[0], x[1], x[2], f, eps=epsilon, **opts) + elif nufft_type == 3: + if dim == 1: + return finufft.nufft1d3(x[0], c, d[0], eps=epsilon, **opts) + elif dim == 2: + return finufft.nufft2d3(x[0], x[1], c, d[0], d[1], eps=epsilon, **opts) + elif dim == 3: + return finufft.nufft3d3(x[0], x[1], x[2], c, d[0], d[1], d[2], eps=epsilon, **opts) + + else: + raise ValueError("Invalid NUFFT type. Use 1, 2, or 3.") + +def benchmark_nufft_collection(nufft_type, nufft_sizes, num_pts, epsilons, n_threads, upsampfacs, dtype, runs=5, + description=""): + """Runs benchmarks while measuring performance across densities and records detailed results.""" + results = {upsamp: [] for upsamp in upsampfacs} + x, c, f, d = generate_random_data(nufft_type, nufft_sizes, num_pts, dtype) + + # Compute density + size_product = np.prod(nufft_sizes) + density = num_pts / size_product + + title = (f"NUFFT Type {nufft_type}, {len(nufft_sizes)}D, {dtype.__name__} " + f"(Size: {'x'.join(map(str, nufft_sizes))}, Num Pts: {num_pts}, Density: {density:.2f}, Threads: {n_threads})") + print(f"\n=== DEBUG: {title} ===") + print(f"{'Epsilon':<10} | {'1.25s':<12} | {'2.0s':<12} | % Diff") + + for epsilon in epsilons: + runtimes = {} + for upsamp in upsampfacs: + avg_runtime, _ = benchmark_function( + run_nufft, nufft_type, nufft_sizes, epsilon, n_threads, upsamp, x, c, f, d, runs=runs + ) + runtimes[upsamp] = avg_runtime + results[upsamp].append(avg_runtime) + + diff = ((runtimes[2.0] - runtimes[1.25]) / runtimes[1.25]) * 100 + print(f"{epsilon:<10.1e} | {runtimes[1.25]:<12.6f} | {runtimes[2.0]:<12.6f} | {diff:+.1f}%") + + # Append a row of results for this epsilon value + all_results.append({ + 'Epsilon': epsilon, + 'Time_1.25': runtimes[1.25], + 'Time_2.0': runtimes[2.0], + '% Diff': diff, + 'NUFFT_type': nufft_type, + 'Data_type': dtype.__name__, + 'Size': 'x'.join(map(str, nufft_sizes)), + 'Num_pts': num_pts, + 'Density': density, + 'n_threads': n_threads + }) + + plot_benchmark_results(results, epsilons, upsampfacs, title, density) + return results + +def plot_benchmark_results(results, epsilons, upsampfacs, title, density): + """Plots performance results while including density information.""" + x_axis = np.arange(len(epsilons)) + width = 0.35 # Bar width + + fig, ax = plt.subplots(figsize=(10, 6)) + colors = ['tab:blue', 'tab:orange'] + + for i, upsamp in enumerate(upsampfacs): + ax.bar(x_axis + (i - 0.5) * width, results[upsamp], width, label=f"upsampfac = {upsamp}", color=colors[i]) + + ax.set_xlabel("Epsilon") + ax.set_ylabel("Average Runtime (s)") + ax.set_title(f"{title} - Density {density:.2f}") + ax.set_xticks(x_axis) + ax.set_xticklabels([f"{eps:.0e}" for eps in epsilons]) + ax.set_yscale("log") + ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1)) + plt.tight_layout() + + os.makedirs("plots", exist_ok=True) + filename = f"plots/{title.replace(' ', '_').replace(',', '').replace(':', '').replace('.', '_')}.png" + plt.savefig(filename) + # plt.show() + +# Define parameters for benchmarking +upsampfacs = [2.0, 1.25] +runs = 5 + +for n_threads in reversed([16]): + # total_elements = 100**3 if n_threads == 1 else 216**3 + total_elements = 100**3 if n_threads == 1 else 216**3 + # Select test dimensions for 1D, 2D, and 3D + size_1d = (int(total_elements),) + size_2d = (int(np.sqrt(total_elements)), int(np.sqrt(total_elements))) + size_3d = (int(np.cbrt(total_elements)), int(np.cbrt(total_elements)), int(np.cbrt(total_elements))) + + # Define num_pts range: starts with 1/16th of volume, ends with 1024x the volume + volume_1d = np.prod(size_1d) + volume_2d = np.prod(size_2d) + volume_3d = np.prod(size_3d) + + num_pts_range = lambda volume: [volume // 16 * (2**i) for i in range(10)] + + + test_cases = [] + for nufft_type in [2, 1]: + for size, desc in [(size_1d, "1D"), (size_2d, "2D"), (size_3d, "3D")]: + for num_pts in reversed(num_pts_range(np.prod(size))): + test_cases.append({ + "nufft_type": nufft_type, + "nufft_sizes": size, + "num_pts": num_pts, + "n_threads": n_threads, + "description": f"NUFFT Type {nufft_type}, {desc}, Threads {n_threads}, Size {'x'.join(map(str, size))}, Num Pts {num_pts}" + }) + +# Run benchmarks and generate plots for each test case and for both float32 and float64. +for case in test_cases: + for dtype in [np.float32, np.float64]: + epsilons = np.logspace(-1, -6, num=6) if dtype == np.float32 else np.logspace(-1, -9, num=9) + + print(f'RUNNING TEST CASE : {case["description"]} with dtype : {dtype.__name__} epsilons : {epsilons}') + benchmark_nufft_collection( + case["nufft_type"], + case["nufft_sizes"], + case["num_pts"], + epsilons, + case["n_threads"], + upsampfacs, + dtype, + runs=runs, + description=case["description"] + ) + +# After all benchmarks are done, build and print the final results table. +df = pd.DataFrame(all_results) +# Reorder columns as desired. +df = df[['Epsilon', 'Time_1.25', 'Time_2.0', '% Diff', 'NUFFT_type', 'Data_type', 'Size', 'Num_pts', 'Density', 'n_threads']] +print("\nFinal Benchmark Results:") +print(df.to_string(index=False)) +df.to_csv('wisdom.csv') +df.to_latex('wisdom.tex') +df.to_markdown('wisdom.md') diff --git a/include/finufft/heuristics.hpp b/include/finufft/heuristics.hpp new file mode 100644 index 000000000..462fab9f6 --- /dev/null +++ b/include/finufft/heuristics.hpp @@ -0,0 +1,353 @@ +#ifndef HEURISTICS_HPP +#define HEURISTICS_HPP + +#include +#include +#include +#include +#include +#include + +namespace finufft::heuristics { +#ifndef FINUFFT_USE_DUCC0 +template +static double bestUpsamplingFactorSinglethread( + const double density, const int dim, const int nufftType, const double epsilon) { + + constexpr bool isFloat = std::is_same_v; + + if constexpr (isFloat) { // single precision + if (nufftType == 1) { + if (dim == 2 && density >= 32 && epsilon <= 1.0e-5) { + return 2.0; + } + if (dim == 3 && density >= 2 && epsilon <= 1.0e-5) { + return 2.0; + } + } // end nufftType == 1 + if (nufftType == 2) { + if (dim == 1 && density >= 8 && epsilon <= 1.0e-5) { + return 2.0; + } + if (dim == 3) { + if (density >= 32 && epsilon <= 1.0e-3) { + return 2.0; + } + if (density >= 4 && epsilon <= 1.0e-4) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-5) { + return 2.0; + } + return 2.0; + } // end dim == 3 + } // end nufftType == 2 + } else { // end single precision, double precision follows + if (nufftType == 1) { + // interestingly for type 1, 1.25 is always faster in 1D + if (dim == 2) { + if (density >= 32 && epsilon <= 1.0e-6) { + return 2.0; + } + if (density >= 16 && epsilon <= 1.0e-7) { + return 2.0; + } + if (density >= 8 && epsilon <= 1.0e-8) { + return 2.0; + } + } // end dim == 2 + if (dim == 3) { + if (density >= 16 && epsilon <= 1.0e-3) { + return 2.0; + } + if (density >= 2 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-7) { + return 2.0; + } + if (epsilon <= 1.0e-8) { // for epsilon <= 1.0e-8, 2.0 is favored in 3D always + return 2.0; + } + } // end dim == 3 + } // end nufftType == 1 + if (nufftType == 2) { + if (dim == 1) { + if (density >= 4 && epsilon <= 1.0e-9) { + return 2.0; + } + } // end dim == 1 + if (dim == 2) { + if (density >= 32 && epsilon <= 1.0e-7) { + return 2.0; + } + } // end dim == 2 + if (dim == 3) { + if (density >= 8 && epsilon <= 1.0e-3) { + return 2.0; + } + if (density >= 4 && epsilon <= 1.0e-4) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 0.5 && epsilon <= 1.0e-7) { + return 2.0; + } + if (density >= 0.25 && epsilon <= 1.0e-8) { + return 2.0; + } + } // end dim == 3 + } // end nufftType == 2 + } + + return 1.25; +} + +template +static double bestUpsamplingFactorMultithread(const double density, const int dim, + const int nufftType, const double epsilon) { + constexpr bool isFloat = std::is_same_v; + if constexpr (isFloat) { + // in float multi-threaded, this is the only case where 2.0 is faster + if (dim == 3 && nufftType == 1 && epsilon <= 1.0e-6 && density >= 32) { + return 2.0; + } + } else { + // In not 3D, 1.25 is always better + if (dim != 3) return 1.25; + + if (nufftType == 1) { + if (density >= 16 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 8 && epsilon <= 1.0e-7) { + return 2.0; + } + if (density >= 2 && epsilon <= 1.0e-8) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-9) { + return 2.0; + } + } + if (nufftType == 2) { + if (density >= 16 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 8 && epsilon <= 1.0e-6) { + return 2.0; + } + if (density >= 4 && epsilon <= 1.0e-7) { + return 2.0; + } + if (density >= 2 && epsilon <= 1.0e-8) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-9) { + return 2.0; + } + } + } + + return 1.25; +} + +#else + +template +static double bestUpsamplingFactorSinglethread( + const double density, const int dim, const int nufftType, const double epsilon) { + + constexpr bool isFloat = std::is_same_v; + + if constexpr (isFloat) { // single precision + if (nufftType == 1) { + if (dim == 2 && density >= 4 && epsilon <= 1.0e-5) { + return 2.0; + } + // if we got here and it is not 3D return 1.25 + if (dim != 3) { + return 1.25; + } + if (density >= 16 && epsilon <= 1.0e-3) { + return 2.0; + } + if (density >= 8 && epsilon <= 1.0e-4) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-5) { + return 2.0; + } + + } // end nufftType == 1 + if (nufftType == 2) { + if (dim == 1 && density >= 16 && epsilon <= 1.0e-5) { + return 2.0; + } + // if we got here and it is not 3D return 1.25 + if (dim != 3) { + return 1.25; + } + // 3D cases only here + if (density >= 8 && epsilon <= 1.0e-3) { + return 2.0; + } + if (density >= 2 && epsilon <= 1.0e-4) { + return 2.0; + } + if (density >= 0.5 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 0.25 && epsilon <= 1.0e-6) { + return 2.0; + } + } // end nufftType == 2 + } else { // end single precision, double precision follows + if (nufftType == 1) { + // interestingly for type 1, 1.25 is always faster in 1D + if (dim == 2) { + if (density >= 8 && epsilon <= 1.0e-7) { + return 2.0; + } + } // end dim == 2 + if (dim == 3) { + if (density >= 16 && epsilon <= 1.0e-3) { + return 2.0; + } + if (density >= 8 && epsilon <= 1.0e-4) { + return 2.0; + } + if (density >= 2 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-6) { + return 2.0; + } + if (density >= 0.5 && epsilon <= 1.0e-7) { + return 2.0; + } + if (density >= 0.25 && epsilon <= 1.0e-8) { + return 2.0; + } + if (density >= 0.125 && epsilon <= 1.0e-9) { + return 2.0; + } + } // end dim == 3 + } // end nufftType == 1 + if (nufftType == 2) { + if (dim == 1) { + return 1.25; + } // end dim == 1 + if (dim == 2) { + if (density >= 16 && epsilon <= 1.0e-7) { + return 2.0; + } + } // end dim == 2 + if (dim == 3) { + if (density >= 8 && epsilon <= 1.0e-3) { + return 2.0; + } + if (density >= 2 && epsilon <= 1.0e-4) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 0.5 && epsilon <= 1.0e-6) { + return 2.0; + } + if (density >= 0.25 && epsilon <= 1.0e-8) { + return 2.0; + } + if (density >= 0.125 && epsilon <= 1.0e-9) { + return 2.0; + } + } // end dim == 3 + } // end nufftType == 2 + } + return 1.25; +} + +template +static double bestUpsamplingFactorMultithread(const double density, const int dim, + const int nufftType, const double epsilon) { + constexpr bool isFloat = std::is_same_v; + if constexpr (isFloat) { + if (dim == 3 && nufftType == 1 && epsilon <= 1.0e-5 && density >= 16) { + return 2.0; + } + if (dim == 3 && nufftType == 2 && epsilon <= 1.0e-5 && density >= 8) { + return 2.0; + } + + } else { + // only case where 2.0 is faster in 2D + if (dim == 2 && nufftType == 1 && density >= 32 && epsilon <= 1.0e-8) { + return 2.0; + } + // the following are only 3D cases if it reaches here and is not 3D return 1.25 + if (dim != 3) { + return 1.25; + } + if (nufftType == 1) { // same as fftw here + if (density >= 16 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 8 && epsilon <= 1.0e-7) { + return 2.0; + } + if (density >= 2 && epsilon <= 1.0e-8) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-9) { + return 2.0; + } + } + if (nufftType == 2) { // some different cases here + if (density >= 8 && epsilon <= 1.0e-5) { + return 2.0; + } + if (density >= 4 && epsilon <= 1.0e-7) { + return 2.0; + } + if (density >= 1 && epsilon <= 1.0e-8) { + return 2.0; + } + } + } + + return 1.25; +} + +#endif +template +double bestUpsamplingFactor(const int nthreads, const double density, const int dim, + const int nufftType, const double epsilon) { + // 1) For epsilons <= 1e-9, 1.25 is not supported + if (epsilon <= 1.0e-9) { + return 2.0; + } + + // 2) Special-case for nufftType == 3 + // TODO: maybe use the bandwidth here? + if (nufftType == 3) { + return 1.25; + } + // 3) For 1 thread, use single-threaded heuristic + if (nthreads == 1) { + return bestUpsamplingFactorSinglethread(density, dim, nufftType, epsilon); + } + // 4) For 2 threads, use multi-threaded heuristic + if (nthreads > 1) { + return bestUpsamplingFactorMultithread(density, dim, nufftType, epsilon); + } + + // 4) Otherwise, return 1.25 + return 1.25; +} + +} // end namespace finufft::heuristics + +#endif // HEURISTICS_HPP diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index fc36f1f03..6342c7856 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "../contrib/legendre_rule_fast.h" @@ -14,6 +15,7 @@ using namespace finufft; using namespace finufft::utils; using namespace finufft::spreadinterp; using namespace finufft::quadrature; +using namespace finufft::heuristics; /* Computational core for FINUFFT. @@ -268,7 +270,7 @@ template class KernelFseries { FINUFFT_ALWAYS_INLINE T operator()(T k) { T x = 0; for (size_t n = 0; n < z.size(); ++n) - x += f[n] * 2 * cos(k * z[n]); // pos & neg freq pair. use T cos! + x += f[n] * 2 * std::cos(k * z[n]); // pos & neg freq pair. use T cos! return x; } }; @@ -632,18 +634,14 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i } // heuristic to choose default upsampfac... (currently two poss) - if (opts.upsampfac == 0.0) { // indicates auto-choose - opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach - if (type == 3) // could move to setpts, more known? - opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT - else if ((dim == 1 && N() > 10000000) || (dim == 2 && N() > 300000) || - (dim == 3 && N() > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon - opts.upsampfac = 1.25; - } + if (opts.upsampfac == 0.0) { // indicates auto-choose + const auto density = double(nj) / double(N() > 0 ? N() : 1); // dumbinputs allows + // N()==0 + opts.upsampfac = bestUpsamplingFactor(opts.nthreads, density, dim, type, tol); if (opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n", __func__, opts.upsampfac); + printf("[%s] threads %d, density %.3g, dim %d, nufft type %d, tol %.3g: auto " + "upsampfac=%.2f\n", + __func__, opts.nthreads, density, dim, type, tol, opts.upsampfac); } // use opts to choose and write into plan's spread options... ier = setup_spreader_for_nufft(spopts, tol, opts, dim); From c6860a8d81bfd79bc63a7a2eb289e4e89ddfed7f Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 27 Feb 2025 19:42:59 -0500 Subject: [PATCH 5/5] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d21cfc3f..a7213643a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,8 +287,8 @@ if(FINUFFT_USE_CUDA) "FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native'" ) message(WARNING "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") + set(CMAKE_CUDA_ARCHITECTURES "native") endif() - set(CMAKE_CUDA_ARCHITECTURES "native") enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda)