From b1e484fb294b2759d3b6b1f68ca0bf5e255b87d1 Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Tue, 25 Feb 2025 16:39:24 -0500
Subject: [PATCH 1/5] :reformatting

---
 .clang-format                                 |   8 +
 .gersemirc                                    |   3 +
 .git-blame-ignore-revs                        |   2 +-
 .gitattributes                                |   2 +-
 .pre-commit-config.yaml                       |  10 +-
 .readthedocs.yaml                             |   2 +-
 CMakeLists.txt                                | 440 ++++++++----------
 cmake/setupCPM.cmake                          |  20 +-
 cmake/setupDUCC.cmake                         |  87 ++--
 cmake/setupFFTW.cmake                         | 164 +++----
 cmake/setupSphinx.cmake                       |  28 +-
 cmake/setupXSIMD.cmake                        |  56 +--
 cmake/utils.cmake                             | 144 +++---
 contributing.md                               |   2 +-
 devel/CMakeLists.txt                          |  23 +-
 devel/GuruInterfaceBranch_BrainDump           |  38 +-
 devel/agenda_11-27-23.txt                     |   5 +-
 devel/check_dumbinputs.cmake                  |  66 +--
 .../compare_foldrescale_PR440_laptop5700U.txt |   7 +-
 .../cuda/cufinufft_tasks_meeting_Jun2023.txt  |   5 -
 devel/eval_ker_expts.cpp                      |   2 +-
 devel/finufft_meeting-7-5-23.txt              |   1 -
 devel/foldrescale_spreadtest1d_slower.txt     |   1 -
 devel/gen_all_horner_cpp_header.m             |   8 +-
 devel/ker_ppval_coeff_mat.m                   |   3 +-
 devel/padding.cpp                             |   2 +-
 devel/plans_fall23.txt                        |   1 -
 devel/reverse_engineer_tol.m                  |   4 +-
 devel/test_ker_ppval.cpp                      |   6 +-
 devel/v2spec.md                               |   4 +-
 docs/Makefile                                 |   2 +-
 docs/c.rst                                    |   7 +-
 docs/c1d.doc                                  |  84 ++--
 docs/c1d.docsrc                               |   4 +-
 docs/c2d.doc                                  |  78 ++--
 docs/c3d.doc                                  |  98 ++--
 docs/c3d.docsrc                               |   2 +-
 docs/cguru.doc                                |  68 +--
 docs/cguru.docsrc                             |   4 +-
 docs/install_gpu.rst                          |   4 +-
 docs/latexindex.rst                           |  10 +-
 docs/makefile.doc                             |   4 +-
 docs/math.rst                                 |  12 +-
 docs/matlab.rst                               |   6 +-
 docs/nfft_migr.rst                            |   2 +-
 docs/refs.rst                                 |   1 -
 docs/tut.rst                                  |   5 +-
 docs/tutorial/contft.rst                      |  15 +-
 docs/tutorial/grf.rst                         |   4 +-
 docs/tutorial/peripois2d.rst                  |  20 +-
 docs/tutorial/serieseval.rst                  |  19 +-
 examples/CMakeLists.txt                       |  47 +-
 examples/cuda/CMakeLists.txt                  |  12 +-
 fortran/CMakeLists.txt                        |  31 +-
 fortran/cmcl_license.txt                      |   8 +-
 fortran/directft/README                       |   2 +-
 fortran/directft/dirft1d.f                    |  14 +-
 fortran/directft/dirft1df.f                   |  12 +-
 fortran/directft/dirft2d.f                    |   8 +-
 fortran/directft/dirft2df.f                   |   8 +-
 fortran/directft/dirft3d.f                    |  13 +-
 fortran/directft/dirft3df.f                   |  13 +-
 fortran/directft/prini.f                      |   2 +-
 fortran/examples/guru1d1.f                    |  12 +-
 fortran/examples/guru1d1f.f                   |   8 +-
 fortran/examples/nufft1d_demo.f               |   2 +-
 fortran/examples/nufft2d_demo.f               |   4 +-
 fortran/examples/nufft2d_demof.f              |   4 +-
 fortran/examples/nufft2dmany_demo.f           |   8 +-
 fortran/examples/nufft2dmany_demof.f          |   8 +-
 fortran/examples/nufft3d_demo.f               |   4 +-
 fortran/examples/nufft3d_demof.f              |   6 +-
 fortran/examples/simple1d1.f                  |  10 +-
 fortran/examples/simple1d1.f90                |  20 +-
 fortran/examples/simple1d1f.f                 |  10 +-
 include/cufinufft/contrib/helper_cuda.h       |   2 +-
 include/cufinufft/impl.h                      |  66 +--
 include/cufinufft/spreadinterp.h              |   2 +-
 include/cufinufft/types.h                     |   6 +-
 matlab/CMakeLists.txt                         |   7 +-
 matlab/finufft1d1.m                           |   1 -
 matlab/finufft1d2.m                           |   1 -
 matlab/finufft1d3.m                           |   1 -
 matlab/finufft2d1.m                           |   1 -
 matlab/finufft2d2.m                           |   1 -
 matlab/finufft2d3.m                           |   1 -
 matlab/finufft3d1.m                           |   1 -
 matlab/finufft3d2.m                           |   1 -
 matlab/finufft3d3.m                           |   2 -
 matlab/finufft_plan.docsrc                    |   3 +-
 matlab/finufft_plan.m                         |   1 -
 matlab/notes.docbit                           |   1 -
 matlab/test/check_modeords.m                  |   2 +-
 matlab/test/fig_accuracy.m                    |   1 -
 matlab/test/guru_setpts_issue.m               |   2 +-
 matlab/valid_setpts.m                         |   4 +-
 perftest/CMakeLists.txt                       |  32 +-
 perftest/checkGuruTiming.sh                   |   4 +-
 perftest/compare_spreads.jl                   |   6 +-
 perftest/cuda/CMakeLists.txt                  |  15 +-
 perftest/cuda/cuperftest.cu                   |   4 +-
 perftest/mymaxthreads.sh                      |   2 +-
 perftest/mynumcores.sh                        |   2 +-
 perftest/results/gcc_vs_icc_xeon.txt          |   3 +-
 perftest/searchForTimeMetrics.py              |   6 +-
 perftest/spreadbenchmark.py                   |  22 +-
 perftest/spreadingSchemeStats.py              |  20 +-
 perftest/spreadtestnd.sh                      |   2 +-
 perftest/timingBreakdowns.py                  |  70 ++-
 .../timingBreakdowns_largeProblems.out        |   2 +-
 ...eakdowns_smallProblems_SequentialMulti.out |   3 +-
 ...smallProblems_SequentialMulti_noSwitch.out |   2 +-
 ...downs_smallProblems_SimultaneousSingle.out |   3 +-
 python/CMakeLists.txt                         |  24 +-
 python/finufft/test/accuracy_speed_tests.py   |   2 +-
 src/cuda/1d/README                            |  12 +-
 src/cuda/2d/README                            |  12 +-
 src/cuda/3d/README                            |  14 +-
 src/cuda/CMakeLists.txt                       |  56 ++-
 src/cuda/README                               |   2 +-
 src/cuda/memtransfer_wrapper.cu               |   7 +-
 test/CMakeLists.txt                           |  45 +-
 test/cuda/CMakeLists.txt                      | 129 ++---
 test/spreadinterp1d_test.cpp                  |   2 +-
 tutorial/README                               |   1 -
 tutorial/inv1d2.m                             |   2 +-
 tutorial/serieseval1d.m                       |   2 +-
 tutorial/utils/lgwt.m                         |  46 +-
 128 files changed, 1190 insertions(+), 1330 deletions(-)
 create mode 100644 .gersemirc

diff --git a/.clang-format b/.clang-format
index 0488353ca..e71296a29 100644
--- a/.clang-format
+++ b/.clang-format
@@ -30,4 +30,12 @@ SpaceAfterLogicalNot: false
 SpaceAfterTemplateKeyword: false
 TabWidth: 2
 UseTab: Never
+AttributeMacros: ['__host__', '__device__', '__global__', '__forceinline__']
+QualifierOrder:
+  - static
+  - inline
+  - constexpr
+  - const
+  - type
+QualifierAlignment: Custom
 ...
diff --git a/.gersemirc b/.gersemirc
new file mode 100644
index 000000000..d422a5f81
--- /dev/null
+++ b/.gersemirc
@@ -0,0 +1,3 @@
+definitions:
+  - "./cmake"
+line_length: 120
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 1e469ec95..72361d1e3 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -1,2 +1,2 @@
 # Applied clang format to the codebase
-884ba427be0c60aa3399d5ea71b0e9e3a7cbf686
\ No newline at end of file
+884ba427be0c60aa3399d5ea71b0e9e3a7cbf686
diff --git a/.gitattributes b/.gitattributes
index dcd444fbe..d11d273f7 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,2 @@
 # Convert to LF line endings on checkout.
-*.sh text eol=lf
\ No newline at end of file
+*.sh text eol=lf
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f935d780a..a2c1f8de3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,6 +4,7 @@ repos:
     hooks:
       - id: clang-format
         types_or: [c++, c, cuda]
+        exclude: '(^|/)(matlab/.*)$'
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v5.0.0
     hooks:
@@ -13,6 +14,13 @@ repos:
       - id: check-illegal-windows-names
       - id: mixed-line-ending
   - repo: https://github.com/BlankSpruce/gersemi
-    rev: 0.18.2
+    rev: 0.19.1
     hooks:
       - id: gersemi
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.23  # Use the latest stable version
+    hooks:
+      - id: validate-pyproject
+        # Optional: Include additional validations from SchemaStore
+        additional_dependencies: ["validate-pyproject-schema-store[all]"]
+        files: ^python/(finufft|cufinufft)/pyproject\.toml$
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 0f63d4f34..5f990bbf8 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -10,7 +10,7 @@ build:
   os: ubuntu-22.04
   tools:
     python: "3.11"
-    
+
 # Build all formats
 formats: all
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c6284195..6a9d93d93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,15 +1,12 @@
 cmake_minimum_required(VERSION 3.19...3.30)
-project(
-  FINUFFT
-  VERSION 2.4.0
-  LANGUAGES C CXX)
+project(FINUFFT VERSION 2.4.0 LANGUAGES C CXX)
 
 # windows MSVC runtime flags policy
 cmake_policy(SET CMP0091 NEW)
 
 include(CMakeDependentOption)
 
-# cmake-format: off
+# gersemi: off
 # All options go here sphinx tag (don't remove): @cmake_opts_start
 option(FINUFFT_BUILD_FORTRAN "Whether to build the FINUFFT Fortran examples" OFF)
 option(FINUFFT_BUILD_MATLAB "Whether to build the FINUFFT Matlab interface" OFF)
@@ -36,11 +33,11 @@ set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying targ
 cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF)
 cmake_dependent_option(FINUFFT_STATIC_LINKING "Disable static libraries in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF)
 cmake_dependent_option(FINUFFT_SHARED_LINKING "Shared should be the opposite of static linking" ON "NOT FINUFFT_STATIC_LINKING" OFF)
-# cmake-format: on
+# gersemi: on
 
 # When building shared libraries, we need to build with -fPIC in all cases
 if(FINUFFT_SHARED_LINKING)
-  set(FINUFFT_POSITION_INDEPENDENT_CODE ON)
+    set(FINUFFT_POSITION_INDEPENDENT_CODE ON)
 endif()
 
 include(cmake/utils.cmake)
@@ -67,10 +64,10 @@ set(FINUFFT_CXX_FLAGS_RELEASE
     /Ob
     /Oi
     /Ot
-    /Oy)
+    /Oy
+)
 
-filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_RELEASE
-                                FINUFFT_CXX_FLAGS_RELEASE)
+filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_RELEASE FINUFFT_CXX_FLAGS_RELEASE)
 message(STATUS "FINUFFT Release flags: ${FINUFFT_CXX_FLAGS_RELEASE}")
 set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE})
 
@@ -81,58 +78,49 @@ set(FINUFFT_CXX_FLAGS_DEBUG
     -ggdb3
     -Wall
     -Wno-sign-compare
-    -Wno-unknown-pragmas)
+    -Wno-unknown-pragmas
+)
 
 if(DEFINED ENV{GITHUB_ACTIONS})
-  if($ENV{GITHUB_ACTIONS} STREQUAL "true")
-    message("CMake is being executed inside a GitHub workflow")
-    # if msvc use FS flag
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT CMAKE_BUILD_TYPE STREQUAL
-                                                 "Release")
-      set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT
-          "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
-      message("CMAKE_MSVC_DEBUG_INFORMATION_FORMAT TO Embedded")
+    if($ENV{GITHUB_ACTIONS} STREQUAL "true")
+        message("CMake is being executed inside a GitHub workflow")
+        # if msvc use FS flag
+        if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT CMAKE_BUILD_TYPE STREQUAL "Release")
+            set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>")
+            message("CMAKE_MSVC_DEBUG_INFORMATION_FORMAT TO Embedded")
+        endif()
     endif()
-  endif()
 else()
-  message("CMake is NOT being executed inside a GitHub workflow")
-  # env variable is:
-  message(STATUS "ENV{GITHUB_ACTIONS}: $ENV{GITHUB_ACTIONS}")
+    message("CMake is NOT being executed inside a GitHub workflow")
+    # env variable is:
+    message(STATUS "ENV{GITHUB_ACTIONS}: $ENV{GITHUB_ACTIONS}")
 endif()
 filter_supported_compiler_flags(FINUFFT_CXX_FLAGS_DEBUG FINUFFT_CXX_FLAGS_DEBUG)
 message(STATUS "FINUFFT Debug flags: ${FINUFFT_CXX_FLAGS_DEBUG}")
-list(APPEND FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE}
-     ${FINUFFT_CXX_FLAGS_DEBUG})
-message(
-  STATUS "FINUFFT RelWithDebInfo flags: ${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}")
+list(APPEND FINUFFT_CXX_FLAGS_RELWITHDEBINFO ${FINUFFT_CXX_FLAGS_RELEASE} ${FINUFFT_CXX_FLAGS_DEBUG})
+message(STATUS "FINUFFT RelWithDebInfo flags: ${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}")
 
 if(FINUFFT_ARCH_FLAGS STREQUAL "native")
-  set(FINUFFT_ARCH_FLAGS
-      -march=native
-      CACHE STRING "" FORCE)
-  filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS)
-  if(NOT FINUFFT_ARCH_FLAGS)
-    set(FINUFFT_ARCH_FLAGS
-        -mtune=native
-        CACHE STRING "" FORCE)
+    set(FINUFFT_ARCH_FLAGS -march=native CACHE STRING "" FORCE)
     filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS)
-  endif()
-  if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-    # -march=native emulation for MSVC
-    check_arch_support()
-  endif()
-  if(NOT FINUFFT_ARCH_FLAGS)
-    message(WARNING "No architecture flags are supported by the compiler.")
-  else()
-    message(STATUS "FINUFFT Arch flags: ${FINUFFT_ARCH_FLAGS}")
-  endif()
+    if(NOT FINUFFT_ARCH_FLAGS)
+        set(FINUFFT_ARCH_FLAGS -mtune=native CACHE STRING "" FORCE)
+        filter_supported_compiler_flags(FINUFFT_ARCH_FLAGS FINUFFT_ARCH_FLAGS)
+    endif()
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+        # -march=native emulation for MSVC
+        check_arch_support()
+    endif()
+    if(NOT FINUFFT_ARCH_FLAGS)
+        message(WARNING "No architecture flags are supported by the compiler.")
+    else()
+        message(STATUS "FINUFFT Arch flags: ${FINUFFT_ARCH_FLAGS}")
+    endif()
 endif()
 
 # Set default build type to Release
 if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE
-      Release
-      CACHE STRING "Set the default build type to Release" FORCE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Set the default build type to Release" FORCE)
 endif()
 
 # This set of sources is compiled twice, once in single precision and once in
@@ -142,223 +130,206 @@ set(FINUFFT_PRECISION_DEPENDENT_SOURCES)
 # If we're building for Fortran, make sure we also include the translation
 # layer.
 if(FINUFFT_BUILD_FORTRAN)
-  list(APPEND FINUFFT_PRECISION_DEPENDENT_SOURCES fortran/finufftfort.cpp)
+    list(APPEND FINUFFT_PRECISION_DEPENDENT_SOURCES fortran/finufftfort.cpp)
 endif()
 
 # set linker flags for sanitizer
 set(FINUFFT_SANITIZER_FLAGS)
 if(FINUFFT_ENABLE_SANITIZERS)
-  set(FINUFFT_SANITIZER_FLAGS -fsanitize=address -fsanitize=undefined
-                              -fsanitize=bounds-strict /fsanitize=address /RTC1)
-  filter_supported_compiler_flags(FINUFFT_SANITIZER_FLAGS
-                                  FINUFFT_SANITIZER_FLAGS)
-  set(FINUFFT_SANITIZER_FLAGS
-      $<$<CONFIG:Debug,RelWithDebInfo>:${FINUFFT_SANITIZER_FLAGS}>)
+    set(FINUFFT_SANITIZER_FLAGS
+        -fsanitize=address
+        -fsanitize=undefined
+        -fsanitize=bounds-strict
+        /fsanitize=address
+        /RTC1
+    )
+    filter_supported_compiler_flags(FINUFFT_SANITIZER_FLAGS FINUFFT_SANITIZER_FLAGS)
+    set(FINUFFT_SANITIZER_FLAGS $<$<CONFIG:Debug,RelWithDebInfo>:${FINUFFT_SANITIZER_FLAGS}>)
 endif()
 # Utility function to enable ASAN on debug builds
 function(enable_asan target)
-  target_compile_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS})
-  if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"))
-    target_link_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS})
-  endif()
+    target_compile_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS})
+    if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"))
+        target_link_options(${target} PRIVATE ${FINUFFT_SANITIZER_FLAGS})
+    endif()
 endfunction()
 
 set(CPM_DOWNLOAD_VERSION 0.40.5)
 include(cmake/setupCPM.cmake)
 
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
-  include(CTest)
-  if(FINUFFT_BUILD_TESTS)
-    enable_testing()
-  endif()
-  if(FINUFFT_BUILD_DOCS)
-    include(cmake/setupSphinx.cmake)
-  endif()
+    include(CTest)
+    if(FINUFFT_BUILD_TESTS)
+        enable_testing()
+    endif()
+    if(FINUFFT_BUILD_DOCS)
+        include(cmake/setupSphinx.cmake)
+    endif()
 endif()
 
 if(FINUFFT_USE_CPU)
-  # make apple with gnu use old linker, new linker breaks, see issue #360
-  if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
-    add_link_options("-ld_classic")
-  endif()
-  set(FFTW_VERSION 3.3.10)
-  set(XTL_VERSION 0.7.7)
-  set(XSIMD_VERSION 13.1.0)
-  set(DUCC0_VERSION ducc0_0_36_0)
-  set(FINUFFT_FFTW_LIBRARIES)
-  include(cmake/setupXSIMD.cmake)
-  if(FINUFFT_USE_DUCC0)
-    include(cmake/setupDUCC.cmake)
-  else()
-    include(cmake/setupFFTW.cmake)
-  endif()
-  if(FINUFFT_USE_DUCC0)
-    set(FINUFFT_FFTLIBS ducc0)
-  else()
-    set(FINUFFT_FFTLIBS ${FINUFFT_FFTW_LIBRARIES})
-  endif()
-  if(FINUFFT_USE_OPENMP)
-    find_package(
-      OpenMP
-      COMPONENTS C CXX
-      REQUIRED)
-  endif()
+    # make apple with gnu use old linker, new linker breaks, see issue #360
+    if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
+        add_link_options("-ld_classic")
+    endif()
+    set(FFTW_VERSION 3.3.10)
+    set(XTL_VERSION 0.7.7)
+    set(XSIMD_VERSION 13.1.0)
+    set(DUCC0_VERSION ducc0_0_36_0)
+    set(FINUFFT_FFTW_LIBRARIES)
+    include(cmake/setupXSIMD.cmake)
+    if(FINUFFT_USE_DUCC0)
+        include(cmake/setupDUCC.cmake)
+    else()
+        include(cmake/setupFFTW.cmake)
+    endif()
+    if(FINUFFT_USE_DUCC0)
+        set(FINUFFT_FFTLIBS ducc0)
+    else()
+        set(FINUFFT_FFTLIBS ${FINUFFT_FFTW_LIBRARIES})
+    endif()
+    if(FINUFFT_USE_OPENMP)
+        find_package(OpenMP COMPONENTS C CXX REQUIRED)
+    endif()
 endif()
 
 # check if -Wno-deprecated-declarations is supported
-check_cxx_compiler_flag(-Wno-deprecated-declarations
-                        FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)
+check_cxx_compiler_flag(-Wno-deprecated-declarations FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)
 
 # Utility function to link static/dynamic lib
 function(finufft_link_test target)
-  if(FINUFFT_USE_DUCC0)
-    target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
-  endif()
-  target_link_libraries(${target} PRIVATE finufft xsimd ${FINUFFT_FFTLIBS})
-  if(FINUFFT_USE_OPENMP)
-    target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
-    target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS})
-  endif()
-  enable_asan(${target})
-  target_compile_features(${target} PRIVATE cxx_std_17)
-  set_target_properties(
-    ${target} PROPERTIES MSVC_RUNTIME_LIBRARY
-                         "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-  # disable deprecated warnings for tests if supported
-  if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)
-    target_compile_options(${target} PRIVATE -Wno-deprecated-declarations)
-  endif()
+    if(FINUFFT_USE_DUCC0)
+        target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
+    endif()
+    target_link_libraries(${target} PRIVATE finufft xsimd ${FINUFFT_FFTLIBS})
+    if(FINUFFT_USE_OPENMP)
+        target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
+        target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS})
+    endif()
+    enable_asan(${target})
+    target_compile_features(${target} PRIVATE cxx_std_17)
+    set_target_properties(${target} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+    # disable deprecated warnings for tests if supported
+    if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)
+        target_compile_options(${target} PRIVATE -Wno-deprecated-declarations)
+    endif()
 endfunction()
 
 # Utility function to set finufft compilation options.
 function(set_finufft_options target)
-  target_compile_features(${target} PRIVATE cxx_std_17)
-  target_compile_options(
-    ${target} PRIVATE $<$<CONFIG:Release,RelWithDebInfo>:${FINUFFT_ARCH_FLAGS}>)
-  target_compile_options(
-    ${target} PRIVATE $<$<CONFIG:Release>:${FINUFFT_CXX_FLAGS_RELEASE}>)
-  target_compile_options(
-    ${target}
-    PRIVATE $<$<CONFIG:RelWithDebInfo>:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>)
-  target_compile_options(${target}
-                         PRIVATE $<$<CONFIG:Debug>:${FINUFFT_CXX_FLAGS_DEBUG}>)
-  target_include_directories(
-    ${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
-  target_include_directories(
-    ${target} SYSTEM
-    INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>)
-  set_target_properties(
-    ${target}
-    PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
-               POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE})
-  enable_asan(${target})
-  if(FINUFFT_USE_OPENMP)
-    target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
-    target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS})
-  endif()
-  if(FINUFFT_USE_DUCC0)
-    target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
-  endif()
-  target_link_libraries(${target} PRIVATE xsimd)
-  target_link_libraries(${target} PRIVATE ${FINUFFT_FFTLIBS})
+    target_compile_features(${target} PRIVATE cxx_std_17)
+    target_compile_options(${target} PRIVATE $<$<CONFIG:Release,RelWithDebInfo>:${FINUFFT_ARCH_FLAGS}>)
+    target_compile_options(${target} PRIVATE $<$<CONFIG:Release>:${FINUFFT_CXX_FLAGS_RELEASE}>)
+    target_compile_options(${target} PRIVATE $<$<CONFIG:RelWithDebInfo>:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>)
+    target_compile_options(${target} PRIVATE $<$<CONFIG:Debug>:${FINUFFT_CXX_FLAGS_DEBUG}>)
+    target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
+    target_include_directories(${target} SYSTEM INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>)
+    set_target_properties(
+        ${target}
+        PROPERTIES
+            MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
+            POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}
+    )
+    enable_asan(${target})
+    if(FINUFFT_USE_OPENMP)
+        target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
+        target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS})
+    endif()
+    if(FINUFFT_USE_DUCC0)
+        target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
+    endif()
+    target_link_libraries(${target} PRIVATE xsimd)
+    target_link_libraries(${target} PRIVATE ${FINUFFT_FFTLIBS})
 endfunction()
 
 if(FINUFFT_USE_CPU)
-  set(FINUFFT_SOURCES
-      src/spreadinterp.cpp
-      contrib/legendre_rule_fast.cpp
-      src/fft.cpp
-      src/finufft_core.cpp
-      src/c_interface.cpp
-      src/finufft_utils.cpp
-      fortran/finufftfort.cpp)
-  # Main finufft libraries
-  if(NOT FINUFFT_STATIC_LINKING)
-    add_library(finufft SHARED ${FINUFFT_SOURCES})
-  else()
-    add_library(finufft STATIC ${FINUFFT_SOURCES})
-  endif()
-  set_finufft_options(finufft)
-
-  if(WIN32 AND FINUFFT_SHARED_LINKING)
-    target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL)
-  endif()
-  find_library(MATH_LIBRARY m)
-  if(MATH_LIBRARY)
-    target_link_libraries(finufft PRIVATE ${MATH_LIBRARY})
-  endif()
-  target_include_directories(
-    finufft PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
-  target_include_directories(
-    finufft SYSTEM
-    INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>)
-  if(FINUFFT_ENABLE_INSTALL)
-    file(GLOB FINUFFT_PUBLIC_HEADERS
-         "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h")
-    set_target_properties(finufft PROPERTIES PUBLIC_HEADER
-                                             "${FINUFFT_PUBLIC_HEADERS}")
-  endif()
-  list(APPEND INSTALL_TARGETS finufft)
+    set(FINUFFT_SOURCES
+        src/spreadinterp.cpp
+        contrib/legendre_rule_fast.cpp
+        src/fft.cpp
+        src/finufft_core.cpp
+        src/c_interface.cpp
+        src/finufft_utils.cpp
+        fortran/finufftfort.cpp
+    )
+    # Main finufft libraries
+    if(NOT FINUFFT_STATIC_LINKING)
+        add_library(finufft SHARED ${FINUFFT_SOURCES})
+    else()
+        add_library(finufft STATIC ${FINUFFT_SOURCES})
+    endif()
+    set_finufft_options(finufft)
+
+    if(WIN32 AND FINUFFT_SHARED_LINKING)
+        target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL)
+    endif()
+    find_library(MATH_LIBRARY m)
+    if(MATH_LIBRARY)
+        target_link_libraries(finufft PRIVATE ${MATH_LIBRARY})
+    endif()
+    target_include_directories(finufft PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
+    target_include_directories(finufft SYSTEM INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/include>)
+    if(FINUFFT_ENABLE_INSTALL)
+        file(GLOB FINUFFT_PUBLIC_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h")
+        set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}")
+    endif()
+    list(APPEND INSTALL_TARGETS finufft)
 endif()
 
 if(FINUFFT_USE_CUDA)
-  if(NOT DEFINED FINUFFT_CUDA_ARCHITECTURES)
-    if(DEFINED CMAKE_CUDA_ARCHITECTURES)
-      set(FINUFFT_CUDA_ARCHITECTURES "{$CMAKE_CUDA_ARCHITECTURES}")
-    else()
-      message(
-        "FINUFFT WARNING: No CUDA architecture supplied via '-DFINUFFT_CUDA_ARCHITECTURES=...', defaulting to 'native'"
-      )
-      message(
-        "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply."
-      )
+    if(NOT DEFINED FINUFFT_CUDA_ARCHITECTURES)
+        if(DEFINED CMAKE_CUDA_ARCHITECTURES)
+            set(FINUFFT_CUDA_ARCHITECTURES "{$CMAKE_CUDA_ARCHITECTURES}")
+        else()
+            message(
+                "FINUFFT WARNING: No CUDA architecture supplied via '-DFINUFFT_CUDA_ARCHITECTURES=...', defaulting to 'native'"
+            )
+            message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
+        endif()
+    endif()
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+    add_subdirectory(src/cuda)
+    if(BUILD_TESTING AND FINUFFT_BUILD_TESTS)
+        add_subdirectory(perftest/cuda)
+        add_subdirectory(test/cuda)
     endif()
-  endif()
-  enable_language(CUDA)
-  find_package(CUDAToolkit REQUIRED)
-  add_subdirectory(src/cuda)
-  if(BUILD_TESTING AND FINUFFT_BUILD_TESTS)
-    add_subdirectory(perftest/cuda)
-    add_subdirectory(test/cuda)
-  endif()
-
-  list(APPEND INSTALL_TARGETS cufinufft)
+
+    list(APPEND INSTALL_TARGETS cufinufft)
 endif()
 
 # Add tests defined in their own directory
-if(BUILD_TESTING
-   AND FINUFFT_USE_CPU
-   AND FINUFFT_BUILD_TESTS)
-  add_subdirectory(test)
-  add_subdirectory(perftest)
+if(BUILD_TESTING AND FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS)
+    add_subdirectory(test)
+    add_subdirectory(perftest)
 endif()
 
 if(FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU)
-  add_subdirectory(examples)
+    add_subdirectory(examples)
 endif()
 
 if(FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CUDA)
-  add_subdirectory(examples/cuda)
+    add_subdirectory(examples/cuda)
 endif()
 
 if(FINUFFT_BUILD_FORTRAN)
-  enable_language(Fortran)
-  add_subdirectory(fortran)
+    enable_language(Fortran)
+    add_subdirectory(fortran)
 endif()
 
 if(FINUFFT_BUILD_MATLAB)
-  add_subdirectory(matlab)
+    add_subdirectory(matlab)
 endif()
 
 if(FINUFFT_BUILD_DEVEL)
-  add_subdirectory(devel)
+    add_subdirectory(devel)
 endif()
 
 if(FINUFFT_BUILD_PYTHON)
-  add_subdirectory(python)
+    add_subdirectory(python)
 endif()
 
-# cmake-format: off
+# gersemi: off
 message(STATUS "FINUFFT configuration summary:")
 message(STATUS "  CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
 message(STATUS "  FINUFFT_USE_CPU: ${FINUFFT_USE_CPU}")
@@ -377,31 +348,34 @@ message(STATUS "  FINUFFT_FFTW_SUFFIX: ${FINUFFT_FFTW_SUFFIX}")
 message(STATUS "  FINUFFT_FFTW_LIBRARIES: ${FINUFFT_FFTW_LIBRARIES}")
 message(STATUS "  FINUFFT_ARCH_FLAGS: ${FINUFFT_ARCH_FLAGS}")
 message(STATUS "  FINUFFT_USE_DUCC0: ${FINUFFT_USE_DUCC0}")
-# cmake-format: on
+# gersemi: on
+
 if(FINUFFT_ENABLE_INSTALL)
-  include(GNUInstallDirs)
-  install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER)
-  install(FILES ${PROJECT_SOURCE_DIR}/LICENSE
-          DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/licenses/finufft)
-  if(FINUFFT_USE_CPU)
-    install(
-      DIRECTORY ${PROJECT_SOURCE_DIR}/examples
-      DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft
-      PATTERN "CMakeLists.txt" EXCLUDE
-      PATTERN "README" EXCLUDE
-      PATTERN "examples/cuda" EXCLUDE)
-    if(FINUFFT_BUILD_FORTRAN)
-      install(DIRECTORY ${PROJECT_SOURCE_DIR}/fortran/examples
-              DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/fortran)
-      install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh
-              DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+    include(GNUInstallDirs)
+    install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER)
+    install(FILES ${PROJECT_SOURCE_DIR}/LICENSE DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/licenses/finufft)
+    if(FINUFFT_USE_CPU)
+        install(
+            DIRECTORY ${PROJECT_SOURCE_DIR}/examples
+            DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft
+            PATTERN "CMakeLists.txt" EXCLUDE
+            PATTERN "README" EXCLUDE
+            PATTERN "examples/cuda" EXCLUDE
+        )
+        if(FINUFFT_BUILD_FORTRAN)
+            install(
+                DIRECTORY ${PROJECT_SOURCE_DIR}/fortran/examples
+                DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/fortran
+            )
+            install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+        endif()
+    endif()
+    if(FINUFFT_USE_CUDA)
+        install(
+            DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda
+            DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/examples
+            PATTERN "README" EXCLUDE
+            PATTERN "CMakeLists.txt" EXCLUDE
+        )
     endif()
-  endif()
-  if(FINUFFT_USE_CUDA)
-    install(
-      DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda
-      DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/examples
-      PATTERN "README" EXCLUDE
-      PATTERN "CMakeLists.txt" EXCLUDE)
-  endif()
 endif()
diff --git a/cmake/setupCPM.cmake b/cmake/setupCPM.cmake
index 610f1572b..49f696f67 100644
--- a/cmake/setupCPM.cmake
+++ b/cmake/setupCPM.cmake
@@ -1,21 +1,19 @@
 # USING CPM TO HANDLE DEPENDENCIES
 if(CPM_SOURCE_CACHE)
-  set(CPM_DOWNLOAD_LOCATION
-      "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+    set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 elseif(DEFINED ENV{CPM_SOURCE_CACHE})
-  set(CPM_DOWNLOAD_LOCATION
-      "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+    set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 else()
-  set(CPM_DOWNLOAD_LOCATION
-      "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+    set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
 endif()
 
 if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
-  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
-  file(
-    DOWNLOAD
-    https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
-    ${CPM_DOWNLOAD_LOCATION})
+    message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
+    file(
+        DOWNLOAD
+            https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+            ${CPM_DOWNLOAD_LOCATION}
+    )
 endif()
 
 include(${CPM_DOWNLOAD_LOCATION})
diff --git a/cmake/setupDUCC.cmake b/cmake/setupDUCC.cmake
index 1d9f0f85c..89c3a093c 100644
--- a/cmake/setupDUCC.cmake
+++ b/cmake/setupDUCC.cmake
@@ -1,46 +1,47 @@
-cpmaddpackage(
-  NAME
-  ducc0
-  GIT_REPOSITORY
-  https://gitlab.mpcdf.mpg.de/mtr/ducc.git
-  GIT_TAG
-  ${DUCC0_VERSION}
-  DOWNLOAD_ONLY
-  YES)
+CPMAddPackage(
+    NAME
+    ducc0
+    GIT_REPOSITORY
+    https://gitlab.mpcdf.mpg.de/mtr/ducc.git
+    GIT_TAG
+    ${DUCC0_VERSION}
+    DOWNLOAD_ONLY
+    YES
+)
 
 if(ducc0_ADDED)
-  add_library(
-    ducc0 STATIC
-    ${ducc0_SOURCE_DIR}/src/ducc0/infra/string_utils.cc
-    ${ducc0_SOURCE_DIR}/src/ducc0/infra/threading.cc
-    ${ducc0_SOURCE_DIR}/src/ducc0/infra/mav.cc
-    ${ducc0_SOURCE_DIR}/src/ducc0/math/gridding_kernel.cc
-    ${ducc0_SOURCE_DIR}/src/ducc0/math/gl_integrator.cc)
-  target_include_directories(ducc0 PUBLIC ${ducc0_SOURCE_DIR}/src/)
-  target_compile_options(
-    ducc0 PRIVATE $<$<CONFIG:Release,RelWithDebInfo>:${FINUFFT_ARCH_FLAGS}>)
-  target_compile_options(
-    ducc0 PRIVATE $<$<CONFIG:Release>:${FINUFFT_CXX_FLAGS_RELEASE}>)
-  target_compile_options(
-    ducc0
-    PRIVATE $<$<CONFIG:RelWithDebInfo>:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>)
-  target_compile_features(ducc0 PRIVATE cxx_std_17)
-  # private because we do not want to propagate this requirement
-  set_target_properties(
-    ducc0
-    PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
-               POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE})
-  check_cxx_compiler_flag(-ffast-math HAS_FAST_MATH)
-  if(HAS_FAST_MATH)
-    target_compile_options(ducc0 PRIVATE -ffast-math)
-  endif()
-  check_cxx_compiler_flag(/fp:fast HAS_FP_FAST)
-  if(HAS_FP_FAST)
-    target_compile_options(ducc0 PRIVATE /fp:fast)
-  endif()
-  if(NOT OpenMP_CXX_FOUND)
-    find_package(Threads REQUIRED)
-    target_link_libraries(ducc0 PRIVATE Threads::Threads)
-  endif()
-  enable_asan(ducc0)
+    add_library(
+        ducc0
+        STATIC
+        ${ducc0_SOURCE_DIR}/src/ducc0/infra/string_utils.cc
+        ${ducc0_SOURCE_DIR}/src/ducc0/infra/threading.cc
+        ${ducc0_SOURCE_DIR}/src/ducc0/infra/mav.cc
+        ${ducc0_SOURCE_DIR}/src/ducc0/math/gridding_kernel.cc
+        ${ducc0_SOURCE_DIR}/src/ducc0/math/gl_integrator.cc
+    )
+    target_include_directories(ducc0 PUBLIC ${ducc0_SOURCE_DIR}/src/)
+    target_compile_options(ducc0 PRIVATE $<$<CONFIG:Release,RelWithDebInfo>:${FINUFFT_ARCH_FLAGS}>)
+    target_compile_options(ducc0 PRIVATE $<$<CONFIG:Release>:${FINUFFT_CXX_FLAGS_RELEASE}>)
+    target_compile_options(ducc0 PRIVATE $<$<CONFIG:RelWithDebInfo>:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>)
+    target_compile_features(ducc0 PRIVATE cxx_std_17)
+    # private because we do not want to propagate this requirement
+    set_target_properties(
+        ducc0
+        PROPERTIES
+            MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
+            POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}
+    )
+    check_cxx_compiler_flag(-ffast-math HAS_FAST_MATH)
+    if(HAS_FAST_MATH)
+        target_compile_options(ducc0 PRIVATE -ffast-math)
+    endif()
+    check_cxx_compiler_flag(/fp:fast HAS_FP_FAST)
+    if(HAS_FP_FAST)
+        target_compile_options(ducc0 PRIVATE /fp:fast)
+    endif()
+    if(NOT OpenMP_CXX_FOUND)
+        find_package(Threads REQUIRED)
+        target_link_libraries(ducc0 PRIVATE Threads::Threads)
+    endif()
+    enable_asan(ducc0)
 endif()
diff --git a/cmake/setupFFTW.cmake b/cmake/setupFFTW.cmake
index e95f8885d..3e6d88063 100644
--- a/cmake/setupFFTW.cmake
+++ b/cmake/setupFFTW.cmake
@@ -1,87 +1,91 @@
-cpmaddpackage(
-  NAME
-  findfftw
-  GIT_REPOSITORY
-  "https://github.com/egpbos/findFFTW.git"
-  GIT_TAG
-  "master"
-  EXCLUDE_FROM_ALL
-  YES
-  GIT_SHALLOW
-  YES)
+CPMAddPackage(
+    NAME
+    findfftw
+    GIT_REPOSITORY
+    "https://github.com/egpbos/findFFTW.git"
+    GIT_TAG
+    "master"
+    EXCLUDE_FROM_ALL
+    YES
+    GIT_SHALLOW
+    YES
+)
 
 list(APPEND CMAKE_MODULE_PATH "${findfftw_SOURCE_DIR}")
 
-if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL
-                                              DOWNLOAD)
-  find_package(FFTW)
-  if((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD))
-    if(FINUFFT_FFTW_SUFFIX STREQUAL THREADS)
-      set(FINUFFT_USE_THREADS ON)
-    else()
-      set(FINUFFT_USE_THREADS OFF)
-    endif()
-    cpmaddpackage(
-      NAME
-      fftw3
-      OPTIONS
-      "ENABLE_SSE2 ON"
-      "ENABLE_AVX ON"
-      "ENABLE_AVX2 ON"
-      "BUILD_TESTS OFF"
-      "BUILD_SHARED_LIBS OFF"
-      "ENABLE_THREADS ${FINUFFT_USE_THREADS}"
-      "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}"
-      URL
-      "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz"
-      URL_HASH
-      "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c"
-      EXCLUDE_FROM_ALL
-      YES
-      GIT_SHALLOW
-      YES)
-
-    cpmaddpackage(
-      NAME
-      fftw3f
-      OPTIONS
-      "ENABLE_SSE2 ON"
-      "ENABLE_AVX ON"
-      "ENABLE_AVX2 ON"
-      "ENABLE_FLOAT ON"
-      "BUILD_TESTS OFF"
-      "BUILD_SHARED_LIBS OFF"
-      "ENABLE_THREADS ${FINUFFT_USE_THREADS}"
-      "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}"
-      URL
-      "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz"
-      URL_HASH
-      "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c"
-      EXCLUDE_FROM_ALL
-      YES
-      GIT_SHALLOW
-      YES)
-    set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f)
-    if(FINUFFT_USE_THREADS)
-      list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_threads fftw3f_threads)
-    elseif(FINUFFT_USE_OPENMP)
-      list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_omp fftw3f_omp)
-    endif()
+if(FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)
+    find_package(FFTW)
+    if((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD))
+        if(FINUFFT_FFTW_SUFFIX STREQUAL THREADS)
+            set(FINUFFT_USE_THREADS ON)
+        else()
+            set(FINUFFT_USE_THREADS OFF)
+        endif()
+        CPMAddPackage(
+            NAME
+            fftw3
+            OPTIONS
+            "ENABLE_SSE2 ON"
+            "ENABLE_AVX ON"
+            "ENABLE_AVX2 ON"
+            "BUILD_TESTS OFF"
+            "BUILD_SHARED_LIBS OFF"
+            "ENABLE_THREADS ${FINUFFT_USE_THREADS}"
+            "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}"
+            URL
+            "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz"
+            URL_HASH
+            "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c"
+            EXCLUDE_FROM_ALL
+            YES
+            GIT_SHALLOW
+            YES
+        )
 
-    foreach(element IN LISTS FINUFFT_FFTW_LIBRARIES)
-      set_target_properties(
-        ${element}
-        PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
-                   POSITION_INDEPENDENT_CODE
-                   ${FINUFFT_POSITION_INDEPENDENT_CODE})
-    endforeach()
+        CPMAddPackage(
+            NAME
+            fftw3f
+            OPTIONS
+            "ENABLE_SSE2 ON"
+            "ENABLE_AVX ON"
+            "ENABLE_AVX2 ON"
+            "ENABLE_FLOAT ON"
+            "BUILD_TESTS OFF"
+            "BUILD_SHARED_LIBS OFF"
+            "ENABLE_THREADS ${FINUFFT_USE_THREADS}"
+            "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}"
+            URL
+            "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz"
+            URL_HASH
+            "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c"
+            EXCLUDE_FROM_ALL
+            YES
+            GIT_SHALLOW
+            YES
+        )
+        set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f)
+        if(FINUFFT_USE_THREADS)
+            list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_threads fftw3f_threads)
+        elseif(FINUFFT_USE_OPENMP)
+            list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_omp fftw3f_omp)
+        endif()
 
-    target_include_directories(
-      fftw3 PUBLIC $<BUILD_INTERFACE:${fftw3_SOURCE_DIR}/api>)
+        foreach(element IN LISTS FINUFFT_FFTW_LIBRARIES)
+            set_target_properties(
+                ${element}
+                PROPERTIES
+                    MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
+                    POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}
+            )
+        endforeach()
 
-  else()
-    set(FINUFFT_FFTW_LIBRARIES
-        "FFTW::Float" "FFTW::Double" "FFTW::Float${FINUFFT_FFTW_SUFFIX}"
-        "FFTW::Double${FINUFFT_FFTW_SUFFIX}")
-  endif()
+        target_include_directories(fftw3 PUBLIC $<BUILD_INTERFACE:${fftw3_SOURCE_DIR}/api>)
+    else()
+        set(FINUFFT_FFTW_LIBRARIES
+            "FFTW::Float"
+            "FFTW::Double"
+            "FFTW::Float${FINUFFT_FFTW_SUFFIX}"
+            "FFTW::Double${FINUFFT_FFTW_SUFFIX}"
+        )
+    endif()
 endif()
diff --git a/cmake/setupSphinx.cmake b/cmake/setupSphinx.cmake
index 996419b5c..30270da9e 100644
--- a/cmake/setupSphinx.cmake
+++ b/cmake/setupSphinx.cmake
@@ -1,21 +1,23 @@
-cpmaddpackage(
-  NAME
-  sphinx_cmake
-  GIT_REPOSITORY
-  https://github.com/k0ekk0ek/cmake-sphinx.git
-  GIT_TAG
-  e13c40a
-  DOWNLOAD_ONLY
-  YES)
+CPMAddPackage(
+    NAME
+    sphinx_cmake
+    GIT_REPOSITORY
+    https://github.com/k0ekk0ek/cmake-sphinx.git
+    GIT_TAG
+    e13c40a
+    DOWNLOAD_ONLY
+    YES
+)
 
 list(APPEND CMAKE_MODULE_PATH ${sphinx_cmake_SOURCE_DIR}/cmake/Modules)
 
 # requires pip install sphinx texext
 find_package(Sphinx)
 if(SPHINX_FOUND)
-  message(STATUS "Sphinx found")
-  sphinx_add_docs(finufft_sphinx BUILDER html SOURCE_DIRECTORY
-                  ${FINUFFT_SOURCE_DIR}/docs)
+    message(STATUS "Sphinx found")
+    sphinx_add_docs(finufft_sphinx BUILDER html SOURCE_DIRECTORY
+                    ${FINUFFT_SOURCE_DIR}/docs
+    )
 else()
-  message(STATUS "Sphinx not found docs will not be generated")
+    message(STATUS "Sphinx not found docs will not be generated")
 endif()
diff --git a/cmake/setupXSIMD.cmake b/cmake/setupXSIMD.cmake
index 5f31ae61e..475412514 100644
--- a/cmake/setupXSIMD.cmake
+++ b/cmake/setupXSIMD.cmake
@@ -1,28 +1,30 @@
-cpmaddpackage(
-  NAME
-  xtl
-  GIT_REPOSITORY
-  "https://github.com/xtensor-stack/xtl.git"
-  GIT_TAG
-  ${XTL_VERSION}
-  EXCLUDE_FROM_ALL
-  YES
-  GIT_SHALLOW
-  YES
-  OPTIONS
-  "XTL_DISABLE_EXCEPTIONS YES")
+CPMAddPackage(
+    NAME
+    xtl
+    GIT_REPOSITORY
+    "https://github.com/xtensor-stack/xtl.git"
+    GIT_TAG
+    ${XTL_VERSION}
+    EXCLUDE_FROM_ALL
+    YES
+    GIT_SHALLOW
+    YES
+    OPTIONS
+    "XTL_DISABLE_EXCEPTIONS YES"
+)
 
-cpmaddpackage(
-  NAME
-  xsimd
-  GIT_REPOSITORY
-  "https://github.com/xtensor-stack/xsimd.git"
-  GIT_TAG
-  ${XSIMD_VERSION}
-  EXCLUDE_FROM_ALL
-  YES
-  GIT_SHALLOW
-  YES
-  OPTIONS
-  "XSIMD_SKIP_INSTALL YES"
-  "XSIMD_ENABLE_XTL_COMPLEX YES")
+CPMAddPackage(
+    NAME
+    xsimd
+    GIT_REPOSITORY
+    "https://github.com/xtensor-stack/xsimd.git"
+    GIT_TAG
+    ${XSIMD_VERSION}
+    EXCLUDE_FROM_ALL
+    YES
+    GIT_SHALLOW
+    YES
+    OPTIONS
+    "XSIMD_SKIP_INSTALL YES"
+    "XSIMD_ENABLE_XTL_COMPLEX YES"
+)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 5aab67dc7..bcbbb5169 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -2,88 +2,76 @@ include(CheckCXXCompilerFlag)
 
 # Define the function
 function(filter_supported_compiler_flags input_flags_var output_flags_var)
-  # Create an empty list to store supported flags
-  set(supported_flags)
-  # Iterate over each flag in the input list
-  set(ORIGINAL_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
-  foreach(flag ${${input_flags_var}})
-    string(REPLACE "=" "_" flag_var ${flag}) # Convert flag to a valid variable
-    # name
-    string(REPLACE "-" "" flag_var ${flag_var}) # Remove '-' for the variable
-    # name Append the test linker flag to the existing flags
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}")
-    check_cxx_compiler_flag(${flag} ${flag_var})
-    if(${flag_var})
-      # If supported, append the flag to the list of supported flags
-      list(APPEND supported_flags ${flag})
-    else()
-      message(STATUS "Flag ${flag} is not supported")
-    endif()
-    unset(${flag_var} CACHE)
-    # remove last flag from CMAKE_EXE_LINKER_FLAGS using substring
-    set(CMAKE_EXE_LINKER_FLAGS ${ORIGINAL_LINKER_FLAGS})
-  endforeach()
-  # Set the output variable to the list of supported flags
-  set(${output_flags_var}
-      ${supported_flags}
-      PARENT_SCOPE)
+    # Create an empty list to store supported flags
+    set(supported_flags)
+    # Iterate over each flag in the input list
+    set(ORIGINAL_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+    foreach(flag ${${input_flags_var}})
+        string(REPLACE "=" "_" flag_var ${flag}) # Convert flag to a valid variable
+        # name
+        string(REPLACE "-" "" flag_var ${flag_var}) # Remove '-' for the variable
+        # name Append the test linker flag to the existing flags
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}")
+        check_cxx_compiler_flag(${flag} ${flag_var})
+        if(${flag_var})
+            # If supported, append the flag to the list of supported flags
+            list(APPEND supported_flags ${flag})
+        else()
+            message(STATUS "Flag ${flag} is not supported")
+        endif()
+        unset(${flag_var} CACHE)
+        # remove last flag from CMAKE_EXE_LINKER_FLAGS using substring
+        set(CMAKE_EXE_LINKER_FLAGS ${ORIGINAL_LINKER_FLAGS})
+    endforeach()
+    # Set the output variable to the list of supported flags
+    set(${output_flags_var} ${supported_flags} PARENT_SCOPE)
 endfunction()
 
 function(check_arch_support)
-  message(STATUS "Checking for AVX, AVX512 and SSE support")
-  try_run(
-    RUN_RESULT_VAR COMPILE_RESULT_VAR ${CMAKE_BINARY_DIR}
-    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/CheckAVX.cpp
-    COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT
-    RUN_OUTPUT_VARIABLE RUN_OUTPUT)
-  if(RUN_OUTPUT MATCHES "AVX512")
-    set(FINUFFT_ARCH_FLAGS
-        "/arch:AVX512"
-        CACHE STRING "" FORCE)
-  elseif(RUN_OUTPUT MATCHES "AVX2")
-    set(FINUFFT_ARCH_FLAGS
-        "/arch:AVX2"
-        CACHE STRING "" FORCE)
-  elseif(RUN_OUTPUT MATCHES "AVX")
-    set(FINUFFT_ARCH_FLAGS
-        "/arch:AVX"
-        CACHE STRING "" FORCE)
-  elseif(RUN_OUTPUT MATCHES "SSE")
-    set(FINUFFT_ARCH_FLAGS
-        "/arch:SSE"
-        CACHE STRING "" FORCE)
-  else()
-    set(FINUFFT_ARCH_FLAGS
-        ""
-        CACHE STRING "" FORCE)
-  endif()
-  message(STATUS "CPU supports: ${RUN_OUTPUT}")
-  message(STATUS "Using MSVC flags: ${FINUFFT_ARCH_FLAGS}")
+    message(STATUS "Checking for AVX, AVX512 and SSE support")
+    try_run(
+        RUN_RESULT_VAR
+        COMPILE_RESULT_VAR
+        ${CMAKE_BINARY_DIR}
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/CheckAVX.cpp
+        COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT
+        RUN_OUTPUT_VARIABLE RUN_OUTPUT
+    )
+    if(RUN_OUTPUT MATCHES "AVX512")
+        set(FINUFFT_ARCH_FLAGS "/arch:AVX512" CACHE STRING "" FORCE)
+    elseif(RUN_OUTPUT MATCHES "AVX2")
+        set(FINUFFT_ARCH_FLAGS "/arch:AVX2" CACHE STRING "" FORCE)
+    elseif(RUN_OUTPUT MATCHES "AVX")
+        set(FINUFFT_ARCH_FLAGS "/arch:AVX" CACHE STRING "" FORCE)
+    elseif(RUN_OUTPUT MATCHES "SSE")
+        set(FINUFFT_ARCH_FLAGS "/arch:SSE" CACHE STRING "" FORCE)
+    else()
+        set(FINUFFT_ARCH_FLAGS "" CACHE STRING "" FORCE)
+    endif()
+    message(STATUS "CPU supports: ${RUN_OUTPUT}")
+    message(STATUS "Using MSVC flags: ${FINUFFT_ARCH_FLAGS}")
 endfunction()
 
 function(copy_dll source_target destination_target)
-  if(NOT WIN32)
-    return()
-  endif()
-  # Get the binary directory of the destination target
-  get_target_property(DESTINATION_DIR ${destination_target} BINARY_DIR)
-  set(DESTINATION_FILE ${DESTINATION_DIR}/$<TARGET_FILE_NAME:${source_target}>)
-  if(NOT EXISTS ${DESTINATION_FILE})
-    message(
-      STATUS
-        "Copying ${source_target} to ${DESTINATION_DIR} directory for ${destination_target}"
-    )
-    # Define the custom command to copy the source target to the destination
-    # directory
-    add_custom_command(
-      TARGET ${destination_target}
-      POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${source_target}>
-              ${DESTINATION_FILE}
-      COMMENT "Copying ${source_target} to ${destination_target} directory")
-  endif()
-  # Unset the variables to leave a clean state
-  unset(DESTINATION_DIR)
-  unset(SOURCE_FILE)
-  unset(DESTINATION_FILE)
+    if(NOT WIN32)
+        return()
+    endif()
+    # Get the binary directory of the destination target
+    get_target_property(DESTINATION_DIR ${destination_target} BINARY_DIR)
+    set(DESTINATION_FILE ${DESTINATION_DIR}/$<TARGET_FILE_NAME:${source_target}>)
+    if(NOT EXISTS ${DESTINATION_FILE})
+        message(STATUS "Copying ${source_target} to ${DESTINATION_DIR} directory for ${destination_target}")
+        # Define the custom command to copy the source target to the destination
+        # directory
+        add_custom_command(
+            TARGET ${destination_target}
+            POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${source_target}> ${DESTINATION_FILE}
+            COMMENT "Copying ${source_target} to ${destination_target} directory"
+        )
+    endif()
+    # Unset the variables to leave a clean state
+    unset(DESTINATION_DIR)
+    unset(SOURCE_FILE)
+    unset(DESTINATION_FILE)
 endfunction()
diff --git a/contributing.md b/contributing.md
index ad79e9abf..14f2abe53 100644
--- a/contributing.md
+++ b/contributing.md
@@ -1,4 +1,4 @@
-This repository is formatted according to the .clang-format in the root directory. 
+This repository is formatted according to the .clang-format in the root directory.
 Please enable the reformatting hook before committing your changes.
 See [pre-commit](https://pre-commit.com/) for more information.
 A quick summary:
diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt
index 45b9a5989..d9d2251bd 100644
--- a/devel/CMakeLists.txt
+++ b/devel/CMakeLists.txt
@@ -3,19 +3,20 @@ project(finufft_devel)
 cmake_minimum_required(VERSION 3.5)
 
 # include cpm cmake, downloading it
-cpmaddpackage(
-  NAME
-  benchmark
-  GITHUB_REPOSITORY
-  google/benchmark
-  VERSION
-  1.8.3
-  OPTIONS
-  "BENCHMARK_ENABLE_TESTING OFF")
+CPMAddPackage(
+    NAME
+    benchmark
+    GITHUB_REPOSITORY
+    google/benchmark
+    VERSION
+    1.8.3
+    OPTIONS
+    "BENCHMARK_ENABLE_TESTING OFF"
+)
 
 if(benchmark_ADDED)
-  # patch benchmark target
-  set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
+    # patch benchmark target
+    set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
 endif()
 
 add_executable(foldrescale foldrescale.cpp)
diff --git a/devel/GuruInterfaceBranch_BrainDump b/devel/GuruInterfaceBranch_BrainDump
index f8ef8fd49..43a2fba8e 100644
--- a/devel/GuruInterfaceBranch_BrainDump
+++ b/devel/GuruInterfaceBranch_BrainDump
@@ -19,7 +19,7 @@ the makefile line "CFLAGS += -I include" ensures all headers inside of this "inc
 available to a file if they are referred to with the former syntax.
 
 |nufft_opts.h| : Additional parameter spread_scheme {0:Sequential Multithreaded, 1: Nested Multithreaded}.
-	         The default spread_scheme is set to 0. 
+	         The default spread_scheme is set to 0.
 
 **Src Directory**
 
@@ -45,14 +45,14 @@ available to a file if they are referred to with the former syntax.
               1) finufft_default_opts(): identical to original version + extra spreading_scheme parameter set default to 0 : sequential
 	      	 		         multithreaded scheme. Set to 1 to switch to simultaneous single threaded with nested at
 					 the end. Description of schemes below.
-					 
+
               2) finufft_makeplan(): sets the fields of the plan struct.
 
 	      	 		     ==================================Types 1 + 2=============================================
 	      	 		     Allocates the internal fw array that is the workspace for fftw_execute(). For cases where
 				     n_trans > 1, user supplied c and F arrays are stacked arrays of size M*n_transf and N*n_transf
 				     respectively. Because finufft_exec() performs its steps in batches of size threadBlkSize
-				     (a parameter for this function)  Internal arrays like fw (and cpj, see finufft_exec) are still 
+				     (a parameter for this function)  Internal arrays like fw (and cpj, see finufft_exec) are still
 				     stacked, but only threadBlkSize times. I.e. in 1d fw is of size nf1*threadBlkSize.
 				     Sets of weights are spread to the uniform grid threadBlkSize at a time. This holds true for all
 				     n_transf >= threadBlkSize. If n_transf < threadBlkSize, then internal arrays are only stacked
@@ -68,9 +68,9 @@ available to a file if they are referred to with the former syntax.
 				     to not run into trouble, however the contents of the array at the end are garbage- never filled
 				     in with sets of spread/interpolated weights. They are not read out either, so there is no effect.
 				     This course was chosen instead of creating a new fftw_plan on the last round with a different
-				     "how_many" and calling fftw_exec on the new plan. 
-				     
-              3) finufft_setpts():  sets the coordinate pointers with the provided pts. 
+				     "how_many" and calling fftw_exec on the new plan.
+
+              3) finufft_setpts():  sets the coordinate pointers with the provided pts.
 
 	      	 		     ==================================Type 1 + 2================================================
 
@@ -89,15 +89,15 @@ available to a file if they are referred to with the former syntax.
 
 				    The target frequencies are scaled, and the spreading kernel fourier weights are computed. PhiHat
 				    is the name of the array in the finufft_plan struct referred to as fwker for type 1+2 and fkker
-				    in type 3 in older code. 
-				  
-	      	 		   
+				    in type 3 in older code.
+
+
               4) finufft_exec(): Main body for both types is contained in a loop over the number of batches.
 	      	 		 Inside are calls to spread, interp, deconvolve, prephase as appropriate for type, and fftw_exec.
 	      	                 nBatches = ceil(n_transf/threadBlkSize). The number of sets in a batch is equal to
 				 min(threadBlkSize, n_transf). Where n_transf > threadBlkSize, i.e. nBatches > 1,
 				 for the last batch, the number of sets in the batch is modulus n_transf/threadBlkSize.
-				 The number of sets in a batch is the number times that the spread, interpolated, deconvolve, 
+				 The number of sets in a batch is the number times that the spread, interpolated, deconvolve,
  				 and prephase routines execute. Thus there are no extra calls to any of these functions on the
 				 last loop of execution. Recall that this is not the case for fftw_exec - internally it will do
 				 more work than necessary on the last round if nBatches > 1.
@@ -134,10 +134,10 @@ available to a file if they are referred to with the former syntax.
 				  because the type 2 n_transf is always less than or equal to the threadBlkSize. On the last
 				  type 3 execute batch, the interior type 2 n_transf field must be set to the modulus of
 				  n_transf/threadBlkSize, because the cj and fk arrays provided by the user are only the
-				  stacked to {M,N}*n_transf, so extra work at the end would try to access outside of allocated memory. 
+				  stacked to {M,N}*n_transf, so extra work at the end would try to access outside of allocated memory.
 
 
-	      5) finnuft_destroy(): releases all memory allocated internally by finufft and calls fftw_destroy on the plan. 
+	      5) finnuft_destroy(): releases all memory allocated internally by finufft and calls fftw_destroy on the plan.
 
 
 **Test Directory**
@@ -149,7 +149,7 @@ available to a file if they are referred to with the former syntax.
 
 |dumbinputs| : has been expanded to check bad inputs across the extended legacy interface
 
-|finufftGuru_test| : Main demo code for the guru interface and test program executable. 
+|finufftGuru_test| : Main demo code for the guru interface and test program executable.
 
                      Note on Usage: spread_scheme parameter after the debug that allows user to choose sequential multithreaded
                      spread vs simultaneous single threaded+nested last. Default is sequential multithreaded spread/interp.
@@ -173,7 +173,7 @@ available to a file if they are referred to with the former syntax.
 
 			 if n_trials = [1,10,100] at the top of the script
 			 pattern of any given printed array: [dim1_1trial, dim1_10trials, dim1_100trials, dim2_1trial, etc ]
-			 
+
 			 Run this on a rusty node and you should find total time ratios ~>= 1 for *almost* all of the big problems
 			 defaulted in this script, indicating that the old implementation took longer than the new.
 
@@ -185,7 +185,7 @@ available to a file if they are referred to with the former syntax.
 			 n_trials grows, evidence for completion of a primary goal: reduce finufft_plan retrieval overhead time
 			 in repeated executions. ***
 
-			 fftw_exec times are printed last. Same sanity check as above applies. 
+			 fftw_exec times are printed last. Same sanity check as above applies.
 
 |spreadingSchemeStats.py| : this script runs a single small problem and a single large problem for the two spreading schemes.
 			    See #4 of Timing Questions and Conclusions for further information on the schemes. The output of
@@ -204,11 +204,11 @@ available to a file if they are referred to with the former syntax.
    The file timingResults/timingBreakdowns_largeProblems.out contain stats reported from timingBreakdowns.py for a problem size 1e7.
    The files timingResults/timingBreakdowns_smallProblems_* contain stats reported for timingBreaddowns.py for a problem of size 1e4.
    All of these show a decrease in time spent in fftw_plan linearly proportional to the number of transforms computed.
-   
+
 2) Speedup due to non-repeated sorting?
    The Spreading results section in timingResults/timingBreakdowns_sortingDominates.out, run with M = 1e7 and N = 1e4,
-   provides good evidence for the guru speedup due to time advantage of single sort over repeated sort. 
-   
+   provides good evidence for the guru speedup due to time advantage of single sort over repeated sort.
+
 3) FFTW and single vs many plan and execution?
    Does FFTW do better performing one fftw_execute many times in a loop or a single time over many sets?
    For large problems, many execution looks better than repeated single, however trials
@@ -252,7 +252,7 @@ available to a file if they are referred to with the former syntax.
    investigation. The code being referred to is identical in spreadAllSetsInBatch and interpAllSetsInBatch in finufft.cpp.
    The lines beginning at the declaration of the n_outerThreads variable down to #pragma omp parallel before the for-loop, and then
    the three lines at the bottom that reset nested to 0.
-   
+
 
 **Testing Gotchas...BEWARE**
 1) |FFTW library clean slate|: Even with fftw_destoy_plan(&plan) [inside of finufft_destroy] and fftw_forget_wisdom()
diff --git a/devel/agenda_11-27-23.txt b/devel/agenda_11-27-23.txt
index 6db640361..f2cb6a6c7 100644
--- a/devel/agenda_11-27-23.txt
+++ b/devel/agenda_11-27-23.txt
@@ -45,7 +45,7 @@ library.....
  - jax waiting on this
  - ask Robert: unclear why some fftw_free or alloc are mutexed and not others?
  - ahb tested w/ all make tasks.]
- 
+
 GPU: PR 330: deal w/ cuda streams     - difficult for ahb to test
  - jax waiting on this
  - multicoil MRI users have opinions
@@ -55,7 +55,7 @@ remove obsolete bkw-compat (nufft_opts) ?
 
 wrappers....
 
-Libin to fix (1,M) vs (M,) Py issue #367 
+Libin to fix (1,M) vs (M,) Py issue #367
 
 [Libin did PR for #359, but not closed? I tested py and closed]
 
@@ -141,4 +141,3 @@ CI is not mentioned.
 #569 ? onedim_fseries_kernel  ... was not brought in
 
 Can I kill .travis.yml ? (3 yrs old - what's it for?)
-
diff --git a/devel/check_dumbinputs.cmake b/devel/check_dumbinputs.cmake
index 4a722fa8b..8ef110189 100644
--- a/devel/check_dumbinputs.cmake
+++ b/devel/check_dumbinputs.cmake
@@ -2,18 +2,18 @@
 # used by test/CMakeLists.txt
 
 # pick place for stdout to be saved
-set(tempout
-  "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${CMAKE_ARGV3}.out"
-  )
+set(tempout "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${CMAKE_ARGV3}.out")
 
 # pipe dumbinputs there. Is it platform-indep?
 # https://stackoverflow.com/questions/36304289/how-to-use-redirection-in-cmake-add-test
 execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/dumbinputs OUTPUT_FILE ${tempout})
 
 # diff the output against reference
-execute_process(COMMAND ${CMAKE_COMMAND} -E compare_files --ignore-eol ${tempout} ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${CMAKE_ARGV3}.refout)
-
-
+execute_process(
+    COMMAND
+        ${CMAKE_COMMAND} -E compare_files --ignore-eol ${tempout}
+        ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${CMAKE_ARGV3}.refout
+)
 
 # ================== the above was an example extra .cmake script
 # in an attempt at platform-indep.
@@ -21,35 +21,35 @@ execute_process(COMMAND ${CMAKE_COMMAND} -E compare_files --ignore-eol ${tempout
 # here are other ideas from the CMakeLists.txt by Alex, and Libin:
 
 if(0)
-# Here's where to save output of dumbinputs for later diff
-set(tempout
-  "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX}.out"
-  )
-# follow https://stackoverflow.com/questions/36304289/how-to-use-redirection-in-cmake-add-test
-# https://stackoverflow.com/questions/39960173/run-custom-shell-script-with-cmake
-# Is pipe here platform-indep? very hard to find out. sh or bash are not cross-platform :(
-add_custom_target(
-  run_dumbinputs_${PREC} ALL
-  COMMENT "Custom target run of dumbinputs${SUFFIX}..."
-  # this still spews stuff to stderr which looks broken to the user; it is not...
-  COMMAND OMP_NUM_THREADS=1 ${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX} > ${tempout}
-  # I don't know if correctly using dependency: it needs the current executable...
-  DEPENDS dumbinputs${SUFFIX}
-  )
-# I considered add_test() as another way (fails since can't handle the pipe >)
-# and add_custom_command(), which fails to run unless add_custom_target used anyway.
-# annoyingly, add_test COMMAND is quite lame: does not allow piping, as above,
-# or multiple cmds
-# See: https://stackoverflow.com/questions/3065220/ctest-with-multiple-commands?rq=4
-add_test(
-  NAME check_dumbinputs_${PREC}
-  COMMAND ${CMAKE_COMMAND} -E compare_files ${tempout} ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${SUFFIX}.refout
-  )
+    # Here's where to save output of dumbinputs for later diff
+    set(tempout "${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX}.out")
+    # follow https://stackoverflow.com/questions/36304289/how-to-use-redirection-in-cmake-add-test
+    # https://stackoverflow.com/questions/39960173/run-custom-shell-script-with-cmake
+    # Is pipe here platform-indep? very hard to find out. sh or bash are not cross-platform :(
+    add_custom_target(
+        run_dumbinputs_${PREC}
+        ALL
+        COMMENT "Custom target run of dumbinputs${SUFFIX}..."
+        # this still spews stuff to stderr which looks broken to the user; it is not...
+        COMMAND OMP_NUM_THREADS=1 ${CMAKE_CURRENT_BINARY_DIR}/dumbinputs${SUFFIX} > ${tempout}
+        # I don't know if correctly using dependency: it needs the current executable...
+        DEPENDS dumbinputs${SUFFIX}
+    )
+    # I considered add_test() as another way (fails since can't handle the pipe >)
+    # and add_custom_command(), which fails to run unless add_custom_target used anyway.
+    # annoyingly, add_test COMMAND is quite lame: does not allow piping, as above,
+    # or multiple cmds
+    # See: https://stackoverflow.com/questions/3065220/ctest-with-multiple-commands?rq=4
+    add_test(
+        NAME check_dumbinputs_${PREC}
+        COMMAND
+            ${CMAKE_COMMAND} -E compare_files ${tempout} ${CMAKE_CURRENT_SOURCE_DIR}/results/dumbinputs${SUFFIX}.refout
+    )
 endif()
 
 # another way of using add_test, remove add_custom_target by calling the devel/check_dumbinputs.cmake
 # if move check_dumbinputs.cmake to test directory, ${CMAKE_SOURCE_DIR} should be changed to ${CMAKE_CURRENT_SOURCE_DIR}
 add_test(
-  NAME check_dumbinputs_${PREC}
-  COMMAND ${CMAKE_COMMAND} -P ${CMAKE_SOURCE_DIR}/devel/check_dumbinputs.cmake ${SUFFIX}
-  )
+    NAME check_dumbinputs_${PREC}
+    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_SOURCE_DIR}/devel/check_dumbinputs.cmake ${SUFFIX}
+)
diff --git a/devel/compare_foldrescale_PR440_laptop5700U.txt b/devel/compare_foldrescale_PR440_laptop5700U.txt
index 0335cdec2..8a1ef5276 100644
--- a/devel/compare_foldrescale_PR440_laptop5700U.txt
+++ b/devel/compare_foldrescale_PR440_laptop5700U.txt
@@ -111,7 +111,7 @@ spreadinterp 1D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
 
 1D Concl: single-thread 7% speedup interp (dir=2) - none to do with sorting
                         5% speedup spread dir=1.
-          multi-thread  no significant change (~1% level).       
+          multi-thread  no significant change (~1% level).
 
 Also noted: PR #440 compile time for spreadinterp.o is 10x longer than before (~5 sec)
 
@@ -144,7 +144,7 @@ spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
 	t2 spreading loop: 	0.752 s
     1e+07 NU pts in 0.892 s 	1.12e+07 pts/s 	8.97e+07 spread pts/s
     max rel err in values at NU pts: 0.315
-    
+
 (base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1
 setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
 	sorted (1 threads):	1.7e-05 s
@@ -195,7 +195,7 @@ spreadinterp 3D, 1e+06 U pts, dir=2, tol=0.1: nspread=2
 	t2 spreading loop: 	0.687 s
     1e+07 NU pts in 0.829 s 	1.21e+07 pts/s 	9.65e+07 spread pts/s
     max rel err in values at NU pts: 0.315
-    
+
 (base) alex@ross /home/alex/numerics/finufft> OMP_NUM_THREADS=8 perftest/spreadtestnd 3 1e8 1e6 1e-1 2 0 1
 setup_spreader (kerevalmeth=1) eps=0.1 sigma=2: chose ns=2 beta=4.4
 	sorted (1 threads):	1.8e-05 s
@@ -226,4 +226,3 @@ concl: single-thread: spread no change; interp is 9% faster
 Overall: only affects single-core perf, and by 9% or less.
 
 (Of course, advantage of no 3pi-restriction is good too)
-
diff --git a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
index c80a53f08..803418950 100644
--- a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
+++ b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
@@ -114,8 +114,3 @@ Low priority, unless it helps rewrite gpu code.
   MKL is faster a bit than FFTW, not worth including.
    Pocketfft is not as fast (ducc.FFT is better). M Reinecke.
    kissFFT terrible & inaccurate.
-
-
-
-
-
diff --git a/devel/eval_ker_expts.cpp b/devel/eval_ker_expts.cpp
index 8da4a1699..36f22aaca 100644
--- a/devel/eval_ker_expts.cpp
+++ b/devel/eval_ker_expts.cpp
@@ -30,7 +30,7 @@ static inline void evaluate_kernel_vector(FLT *__restrict__ ker,
 #pragma omp simd
   for (int i = 0; i < N; i++) // Loop 1: Compute exponential arguments
     ker[i] = beta * sqrt(1.0 - c * args[i] * args[i]);
-    // ker[i] = beta * (1.0 - c*args[i]*args[i]);   // no-sqrt version
+  // ker[i] = beta * (1.0 - c*args[i]*args[i]);   // no-sqrt version
 
 #pragma omp simd
   for (int i = 0; i < N; i++) // Loop 2: Compute exponentials
diff --git a/devel/finufft_meeting-7-5-23.txt b/devel/finufft_meeting-7-5-23.txt
index 3e843d84c..691a54422 100644
--- a/devel/finufft_meeting-7-5-23.txt
+++ b/devel/finufft_meeting-7-5-23.txt
@@ -70,4 +70,3 @@ Reinecke perf improvements CPU: binsorting; interp in 2d/3d.
 1.3 legacy cuf repo.
 
 docs
-
diff --git a/devel/foldrescale_spreadtest1d_slower.txt b/devel/foldrescale_spreadtest1d_slower.txt
index 8b38e8463..2927b97cf 100644
--- a/devel/foldrescale_spreadtest1d_slower.txt
+++ b/devel/foldrescale_spreadtest1d_slower.txt
@@ -68,4 +68,3 @@ spreadinterp 1D, 8e+06 U pts, dir=2, tol=1e-06: nspread=7
 	t2 spreading loop: 	0.197 s
     8e+06 NU pts in 0.205 s 	3.9e+07 pts/s 	2.73e+08 spread pts/s
     max rel err in values at NU pts: 1.13e-06
-
diff --git a/devel/gen_all_horner_cpp_header.m b/devel/gen_all_horner_cpp_header.m
index d6d3a7e52..8d784971a 100644
--- a/devel/gen_all_horner_cpp_header.m
+++ b/devel/gen_all_horner_cpp_header.m
@@ -12,7 +12,7 @@
 
 for upsampfac = [2.0, 1.25]   % sigma: either 2 (default) or low (eg 5/4)
   fprintf('upsampfac = %g...\n',upsampfac)
-  
+
   ws = 2:16;           % list of widths (the driver, rather than tols)
                        % (must match MIN and MAX NSPREAD in source headers)
   if upsampfac==2
@@ -24,7 +24,7 @@
   fwrite(fid,sprintf('// fine-grid interval. Generated by gen_all_horner_cpp_header.m in devel/\n'));
   fwrite(fid,sprintf('// Authors: Alex Barnett, Ludvig af Klinteberg, Marco Barbone & Libin Lu.\n// (C) 2018--2024 The Simons Foundation, Inc.\n'));
   fwrite(fid,sprintf('#include <array>\n\n'));
-  
+
   usf_tag = sprintf('%d',100*upsampfac);  % string, follow Barbone convention: 200 or 125
   fwrite(fid,sprintf('template<class T, uint8_t w>\nconstexpr auto get_horner_coeffs_%s() noexcept {\n',usf_tag));
 
@@ -33,7 +33,7 @@
     [d,beta] = get_degree_and_beta(w,upsampfac);
     fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
     str = gen_ker_horner_loop_cpp_code(w,d,beta);  % code strings for this w (nc hardcoded)
-    
+
     if j==1                                % write switch statement
       fwrite(fid,sprintf('    if constexpr (w == %d) {\n',w));
     else
@@ -48,5 +48,5 @@
   fwrite(fid,sprintf('    static_assert(w <= %d, "w must be <= %d");\n',ws(end),ws(end)));
   fwrite(fid,sprintf('  };\n'));    % close all brackets
   fclose(fid);
-  
+
 end
diff --git a/devel/ker_ppval_coeff_mat.m b/devel/ker_ppval_coeff_mat.m
index 218d42393..34721eb89 100644
--- a/devel/ker_ppval_coeff_mat.m
+++ b/devel/ker_ppval_coeff_mat.m
@@ -1,5 +1,5 @@
 function C = ker_ppval_coeff_mat(w,d,be,o)
-% KER_PPVAL_COEFF_MAT  matrix of piecewise poly ES kernel coeffs 
+% KER_PPVAL_COEFF_MAT  matrix of piecewise poly ES kernel coeffs
 %
 % C = ker_ppval_coeff_mat(w,nterms,beta,opts)
 %
@@ -77,4 +77,3 @@
             w,d,tol,maxerr,maxerr/tol)
   end
 end
-
diff --git a/devel/padding.cpp b/devel/padding.cpp
index 95ace9e12..727023ba7 100644
--- a/devel/padding.cpp
+++ b/devel/padding.cpp
@@ -275,4 +275,4 @@ int main(int argc, char *argv[]) {
             << uint64_t(get_padding<double, 15 * 2>()) << std::endl;
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/devel/plans_fall23.txt b/devel/plans_fall23.txt
index 359d8cc8b..0cda26530 100644
--- a/devel/plans_fall23.txt
+++ b/devel/plans_fall23.txt
@@ -24,4 +24,3 @@ Robert + Joakim
 Go through Issues & prioritize.
 
 PRs.
-
diff --git a/devel/reverse_engineer_tol.m b/devel/reverse_engineer_tol.m
index a57f013ab..cae196735 100644
--- a/devel/reverse_engineer_tol.m
+++ b/devel/reverse_engineer_tol.m
@@ -5,9 +5,9 @@
 %
 %  For fixed upsampfac (aka sigma), must be the inverse function for
 %  how w is chosen from tol in spreadinterp.cpp:setup_spreader()
-  
+
 % Barnett 7/22/24
-  
+
 if upsampfac==2.0
   tol = 10^(1-w);
 else
diff --git a/devel/test_ker_ppval.cpp b/devel/test_ker_ppval.cpp
index 35d29d645..c044ac91c 100644
--- a/devel/test_ker_ppval.cpp
+++ b/devel/test_ker_ppval.cpp
@@ -61,9 +61,9 @@ static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT b
 {
 #pragma omp simd
   for (int i = 0; i < w; i++) ker[i] = exp(beta * sqrt((FLT)1.0 - c * args[i] * args[i]));
-    // gcc 5.4 can't simd the combined loop, hence we split the
-    // out-of-support test to subsequent loop...
-    // This check loop prevents getting 0.2s (600 Meval/s):
+  // gcc 5.4 can't simd the combined loop, hence we split the
+  // out-of-support test to subsequent loop...
+  // This check loop prevents getting 0.2s (600 Meval/s):
 #pragma omp simd
   for (int i = 0; i < w; i++)
     if (fabs(args[i]) >= (FLT)w / 2) // note fabs not abs!
diff --git a/devel/v2spec.md b/devel/v2spec.md
index 0f1d01146..a15cacde3 100644
--- a/devel/v2spec.md
+++ b/devel/v2spec.md
@@ -47,7 +47,7 @@ int finufft1d1(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms,
   return ier;
 }
 
-int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, 
+int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj,
 FLT* xj,FLT *yj, FLT *zj, CPX* cj,int iflag, FLT eps, BIGINT *n_modes, BIGINT nk, FLT *s, FLT *t,  FLT *u,  CPX* fk, nufft_opts *opts_ptr)
 {
   finufft_plan plan;
@@ -158,7 +158,7 @@ finufft.destroy(plan)
 N=2    # now N is interpreted as the dim ? or have separate dim argument??
 plan = finufft.plan(3, N, ntransf, isign, tol)
 # (set x,y, sx, sy)
-finufft.set_nu_pts(plan, x,y,None, sx,sy,None)  # here sx,sy,sz are output NU  
+finufft.set_nu_pts(plan, x,y,None, sx,sy,None)  # here sx,sy,sz are output NU
 finufft.execute(plan, c, f)    # reads c and writes to f.
 # note c, f size must match ntransf
 finufft.destroy(plan)
diff --git a/docs/Makefile b/docs/Makefile
index 4824c849a..bfded925e 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -2,7 +2,7 @@
 #
 
 # You can set these variables from the command line.
-SPHINXOPTS    = 
+SPHINXOPTS    =
 SPHINXBUILD   = sphinx-build
 PAPER         =
 BUILDDIR      = _build
diff --git a/docs/c.rst b/docs/c.rst
index a8d7ab494..c0e773c91 100644
--- a/docs/c.rst
+++ b/docs/c.rst
@@ -38,7 +38,7 @@ then set default values by passing a pointer (here ``opts``) to the following::
 
  void finufft_default_opts(finufft_opts* opts)
  void finufftf_default_opts(finufft_opts* opts)
-   
+
   Set values in a NUFFT options struct to their default values.
 
 Be sure to use the first version for double-precision and the second for single-precision. You may then change options with, for example, ``opts->debug=1;``
@@ -73,9 +73,9 @@ with the word "many" in the function name) perform ``ntr`` transforms with the s
 
 
 .. _guru:
-             
+
 Guru plan interface
--------------------                   
+-------------------
 
 This provides more flexibility than the simple or vectorized interfaces.
 Any transform requires (at least)
@@ -90,4 +90,3 @@ Keep in mind that ``setpts`` retains *pointers* to the user's list of nonuniform
   The ``plan`` object (type ``finufft{f}_plan``) is an opaque pointer; the public interface specifies no more details that that. Under the hood in our library the plan happens to point to a C++ object of type ``finufft{f}_plan_s``, whose internal details the library user should not attempt to access, nor to rely on.
 
 .. include:: cguru.doc
-                    
diff --git a/docs/c1d.doc b/docs/c1d.doc
index c16bc6241..a12401cae 100644
--- a/docs/c1d.doc
+++ b/docs/c1d.doc
@@ -1,23 +1,23 @@
 ::
- 
- int finufft1d1(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64_t 
+
+ int finufft1d1(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64_t
  N1, complex<double>* f, finufft_opts* opts)
- int finufftf1d1(int64_t M, float* x, complex<float>* c, int iflag, float eps, int64_t N1, 
+ int finufftf1d1(int64_t M, float* x, complex<float>* c, int iflag, float eps, int64_t N1,
  complex<float>* f, finufftf_opts* opts)
- 
- int finufft1d1many(int ntr, int64_t M, double* x, complex<double>* c, int iflag, double 
+
+ int finufft1d1many(int ntr, int64_t M, double* x, complex<double>* c, int iflag, double
  eps, int64_t N1, complex<double>* f, finufft_opts* opts)
- int finufftf1d1many(int ntr, int64_t M, float* x, complex<float>* c, int iflag, float 
+ int finufftf1d1many(int ntr, int64_t M, float* x, complex<float>* c, int iflag, float
  eps, int64_t N1, complex<float>* f, finufftf_opts* opts)
- 
+
    1D complex nonuniform FFT of type 1 (nonuniform to uniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
                M-1
       f[k1] =  SUM c[j] exp(+/-i k1 x(j))  for -N1/2 <= k1 <= (N1-1)/2
-               j=0            
- 
+               j=0
+
    Inputs:
      ntr    how many transforms (only for vectorized "many" functions, else ntr=1)
      M      number of nonuniform point sources
@@ -28,38 +28,38 @@
             from 1e-1 down to ~ 1e-14 (in double precision) or 1e-6 (in single)
      N1     number of output Fourier modes to be computed
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      f      Fourier mode coefficients (size N1*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
      * Fourier frequency indices in each dimension i are the integers lying
        in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings.
- 
- 
+
+
 ::
- 
- int finufft1d2(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64_t 
+
+ int finufft1d2(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64_t
  N1, complex<double>* f, finufft_opts* opts)
- int finufftf1d2(int64_t M, float* x, complex<float>* c, int iflag, float eps, int64_t N1, 
+ int finufftf1d2(int64_t M, float* x, complex<float>* c, int iflag, float eps, int64_t N1,
  complex<float>* f, finufftf_opts* opts)
- 
- int finufft1d2many(int ntr, int64_t M, double* x, complex<double>* c, int iflag, double 
+
+ int finufft1d2many(int ntr, int64_t M, double* x, complex<double>* c, int iflag, double
  eps, int64_t N1, complex<double>* f, finufft_opts* opts)
- int finufftf1d2many(int ntr, int64_t M, float* x, complex<float>* c, int iflag, float 
+ int finufftf1d2many(int ntr, int64_t M, float* x, complex<float>* c, int iflag, float
  eps, int64_t N1, complex<float>* f, finufftf_opts* opts)
- 
+
    1D complex nonuniform FFT of type 2 (uniform to nonuniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
       c[j] = SUM   f[k1] exp(+/-i k1 x[j])      for j = 0,...,M-1
-              k1 
+              k1
       where the sum is over integers -N1/2 <= k1 <= (N1-1)/2.
- 
+
    Inputs:
      ntr    how many transforms (only for vectorized "many" functions, else ntr=1)
      M      number of nonuniform point targets
@@ -70,38 +70,38 @@
      N1     number of input Fourier modes
      f      Fourier mode coefficients (size N1*ntr complex array)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      c      values at nonuniform point targets (size M*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
      * Fourier frequency indices in each dimension i are the integers lying
        in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings.
- 
- 
+
+
 ::
- 
- int finufft1d3(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64_t 
+
+ int finufft1d3(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64_t
  N, double* s, complex<double>* f, finufft_opts* opts)
- int finufftf1d3(int64_t M, float* x, complex<float>* c, int iflag, float eps, int64_t N, 
+ int finufftf1d3(int64_t M, float* x, complex<float>* c, int iflag, float eps, int64_t N,
  float* s, complex<float>* f, finufftf_opts* opts)
- 
- int finufft1d3many(int ntr, int64_t M, double* x, complex<double>* c, int iflag, double 
+
+ int finufft1d3many(int ntr, int64_t M, double* x, complex<double>* c, int iflag, double
  eps, int64_t N, double* s, complex<double>* f, finufft_opts* opts)
- int finufftf1d3many(int ntr, int64_t M, float* x, complex<float>* c, int iflag, float 
+ int finufftf1d3many(int ntr, int64_t M, float* x, complex<float>* c, int iflag, float
  eps, int64_t N, float* s, complex<float>* f, finufftf_opts* opts)
- 
+
    1D complex nonuniform FFT of type 3 (nonuniform to nonuniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
                M-1
       f[k]  =  SUM   c[j] exp(+-i s[k] x[j]),      for k = 0,...,N-1
                j=0
- 
+
    Inputs:
      ntr    how many transforms (only for vectorized "many" functions, else ntr=1)
      M      number of nonuniform point sources
@@ -113,11 +113,11 @@
      N      number of nonuniform frequency targets
      s      nonuniform frequency targets in R (length N real array)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      f     Fourier transform values at targets (size N*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
diff --git a/docs/c1d.docsrc b/docs/c1d.docsrc
index e8d563618..c1ba17fe3 100644
--- a/docs/c1d.docsrc
+++ b/docs/c1d.docsrc
@@ -4,7 +4,7 @@ int @F1d1(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64
 @t
               M-1
      f[k1] =  SUM c[j] exp(+/-i k1 x(j))  for -N1/2 <= k1 <= (N1-1)/2
-              j=0            
+              j=0
 
   Inputs:
 @nt
@@ -28,7 +28,7 @@ int @F1d2(int64_t M, double* x, complex<double>* c, int iflag, double eps, int64
   1D complex nonuniform FFT of type 2 (uniform to nonuniform).
 @t
      c[j] = SUM   f[k1] exp(+/-i k1 x[j])      for j = 0,...,M-1
-             k1 
+             k1
      where the sum is over integers -N1/2 <= k1 <= (N1-1)/2.
 
   Inputs:
diff --git a/docs/c2d.doc b/docs/c2d.doc
index d9275bd95..aa0ee6379 100644
--- a/docs/c2d.doc
+++ b/docs/c2d.doc
@@ -1,25 +1,25 @@
 ::
- 
- int finufft2d1(int64_t M, double* x, double* y, complex<double>* c, int iflag, double 
+
+ int finufft2d1(int64_t M, double* x, double* y, complex<double>* c, int iflag, double
  eps, int64_t N1, int64_t N2, complex<double>* f, finufft_opts* opts)
- int finufftf2d1(int64_t M, float* x, float* y, complex<float>* c, int iflag, float eps, 
+ int finufftf2d1(int64_t M, float* x, float* y, complex<float>* c, int iflag, float eps,
  int64_t N1, int64_t N2, complex<float>* f, finufftf_opts* opts)
- 
- int finufft2d1many(int ntr, int64_t M, double* x, double* y, complex<double>* c, int 
+
+ int finufft2d1many(int ntr, int64_t M, double* x, double* y, complex<double>* c, int
  iflag, double eps, int64_t N1, int64_t N2, complex<double>* f, finufft_opts* opts)
- int finufftf2d1many(int ntr, int64_t M, float* x, float* y, complex<float>* c, int iflag, 
+ int finufftf2d1many(int ntr, int64_t M, float* x, float* y, complex<float>* c, int iflag,
  float eps, int64_t N1, int64_t N2, complex<float>* f, finufftf_opts* opts)
- 
+
    2D complex nonuniform FFT of type 1 (nonuniform to uniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
                   M-1
       f[k1,k2] =  SUM c[j] exp(+/-i (k1 x[j] + k2 y[j]))
                   j=0
- 
+
                for -N1/2 <= k1 <= (N1-1)/2,      -N2/2 <= k2 <= (N2-1)/2.
- 
+
    Inputs:
      ntr    how many transforms (only for vectorized "many" functions, else ntr=1)
      M      number of nonuniform point sources
@@ -31,33 +31,33 @@
      N1     number of output Fourier modes to be computed (x direction)
      N2     number of output Fourier modes to be computed (y direction)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      f      Fourier mode coefficients (size N1*N2*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
      * Fourier frequency indices in each dimension i are the integers lying
        in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings.
- 
+
 ::
- 
- int finufft2d2(int64_t M, double* x, double* y, complex<double>* c, int iflag, double 
+
+ int finufft2d2(int64_t M, double* x, double* y, complex<double>* c, int iflag, double
  eps, int64_t N1, int64_t N2, complex<double>* f, finufft_opts* opts)
- int finufftf2d2(int64_t M, float* x, float* y, complex<float>* c, int iflag, float eps, 
+ int finufftf2d2(int64_t M, float* x, float* y, complex<float>* c, int iflag, float eps,
  int64_t N1, int64_t N2, complex<float>* f, finufftf_opts* opts)
- 
- int finufft2d2many(int ntr, int64_t M, double* x, double* y, complex<double>* c, int 
+
+ int finufft2d2many(int ntr, int64_t M, double* x, double* y, complex<double>* c, int
  iflag, double eps, int64_t N1, int64_t N2, complex<double>* f, finufft_opts* opts)
- int finufftf2d2many(int ntr, int64_t M, float* x, float* y, complex<float>* c, int iflag, 
+ int finufftf2d2many(int ntr, int64_t M, float* x, float* y, complex<float>* c, int iflag,
  float eps, int64_t N1, int64_t N2, complex<float>* f, finufftf_opts* opts)
- 
+
    2D complex nonuniform FFT of type 2 (uniform to nonuniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
       c[j] =   SUM   f[k1,k2] exp(+/-i (k1 x[j] + k2 y[j]))    for j = 0,...,M-1
               k1,k2
       where the sum is over integers -N1/2 <= k1 <= (N1-1)/2,
@@ -73,38 +73,38 @@
      N2     number of input Fourier modes (y direction)
      f      Fourier mode coefficients (size N1*N2*ntr complex array)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      c      values at nonuniform point targets (size M*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
      * Fourier frequency indices in each dimension i are the integers lying
        in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings.
- 
+
 ::
- 
- int finufft2d3(int64_t M, double* x, double* y, complex<double>* c, int iflag, double 
+
+ int finufft2d3(int64_t M, double* x, double* y, complex<double>* c, int iflag, double
  eps, int64_t N, double* s, double* t, complex<double>* f, finufft_opts* opts)
- int finufftf2d3(int64_t M, float* x, float* y, complex<float>* c, int iflag, float eps, 
+ int finufftf2d3(int64_t M, float* x, float* y, complex<float>* c, int iflag, float eps,
  int64_t N, float* s, float* t, complex<float>* f, finufftf_opts* opts)
- 
- int finufft2d3many(int ntr, int64_t M, double* x, double* y, complex<double>* c, int 
- iflag, double eps, int64_t N, double* s, double* t, complex<double>* f, finufft_opts* 
+
+ int finufft2d3many(int ntr, int64_t M, double* x, double* y, complex<double>* c, int
+ iflag, double eps, int64_t N, double* s, double* t, complex<double>* f, finufft_opts*
  opts)
- int finufftf2d3many(int ntr, int64_t M, float* x, float* y, complex<float>* c, int iflag, 
+ int finufftf2d3many(int ntr, int64_t M, float* x, float* y, complex<float>* c, int iflag,
  float eps, int64_t N, float* s, float* t, complex<float>* f, finufftf_opts* opts)
- 
+
    2D complex nonuniform FFT of type 3 (nonuniform to nonuniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
                M-1
       f[k]  =  SUM  c[j] exp(+-i (s[k] x[j] + t[k] y[j])),   for k = 0,...,N-1
                j=0
- 
+
    Inputs:
      ntr    how many transforms (only for vectorized "many" functions, else ntr=1)
      M      number of nonuniform point sources
@@ -116,11 +116,11 @@
      N      number of nonuniform frequency targets
      s,t    nonuniform frequency target coordinates in R^2 (length N real arrays)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      f     Fourier transform values at targets (size N*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
diff --git a/docs/c3d.doc b/docs/c3d.doc
index d333d8190..9d6e50877 100644
--- a/docs/c3d.doc
+++ b/docs/c3d.doc
@@ -1,27 +1,27 @@
 ::
- 
- int finufft3d1(int64_t M, double* x, double* y, double* z, complex<double>* c, int iflag, 
+
+ int finufft3d1(int64_t M, double* x, double* y, double* z, complex<double>* c, int iflag,
  double eps, int64_t N1, int64_t N2, int64_t N3, complex<double>* f, finufft_opts* opts)
- int finufftf3d1(int64_t M, float* x, float* y, float* z, complex<float>* c, int iflag, 
+ int finufftf3d1(int64_t M, float* x, float* y, float* z, complex<float>* c, int iflag,
  float eps, int64_t N1, int64_t N2, int64_t N3, complex<float>* f, finufftf_opts* opts)
- 
- int finufft3d1many(int ntr, int64_t M, double* x, double* y, double* z, complex<double>* 
- c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex<double>* f, 
+
+ int finufft3d1many(int ntr, int64_t M, double* x, double* y, double* z, complex<double>*
+ c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex<double>* f,
  finufft_opts* opts)
- int finufftf3d1many(int ntr, int64_t M, float* x, float* y, float* z, complex<float>* c, 
- int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex<float>* f, 
+ int finufftf3d1many(int ntr, int64_t M, float* x, float* y, float* z, complex<float>* c,
+ int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex<float>* f,
  finufftf_opts* opts)
- 
+
    3D complex nonuniform FFT of type 1 (nonuniform to uniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
                   M-1
       f[k1,k2] =  SUM c[j] exp(+/-i (k1 x[j] + k2 y[j] + k3 z[j]))
                   j=0
- 
+
   for -N1/2 <= k1 <= (N1-1)/2,  -N2/2 <= k2 <= (N2-1)/2,  -N3/2 <= k3 <= (N3-1)/2
- 
+
    Inputs:
      ntr    how many transforms (only for vectorized "many" functions, else ntr=1)
      M      number of nonuniform point sources
@@ -34,39 +34,39 @@
      N2     number of output Fourier modes to be computed (y direction)
      N3     number of output Fourier modes to be computed (z direction)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      f      Fourier mode coefficients (size N1*N2*N3*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
      * Fourier frequency indices in each dimension i are the integers lying
        in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings.
- 
- 
+
+
 ::
- 
- int finufft3d2(int64_t M, double* x, double* y, double* z, complex<double>* c, int iflag, 
+
+ int finufft3d2(int64_t M, double* x, double* y, double* z, complex<double>* c, int iflag,
  double eps, int64_t N1, int64_t N2, int64_t N3, complex<double>* f, finufft_opts* opts)
- int finufftf3d2(int64_t M, float* x, float* y, float* z, complex<float>* c, int iflag, 
+ int finufftf3d2(int64_t M, float* x, float* y, float* z, complex<float>* c, int iflag,
  float eps, int64_t N1, int64_t N2, int64_t N3, complex<float>* f, finufftf_opts* opts)
- 
- int finufft3d2many(int ntr, int64_t M, double* x, double* y, double* z, complex<double>* 
- c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex<double>* f, 
+
+ int finufft3d2many(int ntr, int64_t M, double* x, double* y, double* z, complex<double>*
+ c, int iflag, double eps, int64_t N1, int64_t N2, int64_t N3, complex<double>* f,
  finufft_opts* opts)
- int finufftf3d2many(int ntr, int64_t M, float* x, float* y, float* z, complex<float>* c, 
- int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex<float>* f, 
+ int finufftf3d2many(int ntr, int64_t M, float* x, float* y, float* z, complex<float>* c,
+ int iflag, float eps, int64_t N1, int64_t N2, int64_t N3, complex<float>* f,
  finufftf_opts* opts)
- 
+
    3D complex nonuniform FFT of type 2 (uniform to nonuniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
       c[j] =   SUM    f[k1,k2,k3] exp(+/-i (k1 x[j] + k2 y[j] + k3 z[j]))
              k1,k2,k3
- 
+
       for j = 0,...,M-1,
       where the sum is over integers -N1/2 <= k1 <= (N1-1)/2,
                                      -N2/2 <= k2 <= (N2-1)/2,
@@ -83,38 +83,38 @@
      N3     number of input Fourier modes (z direction)
      f      Fourier mode coefficients (size N1*N2*N3*ntr complex array)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      c      values at nonuniform point targets (size M*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * complex arrays interleave Re, Im values, and their size is stated with
        dimensions ordered fastest to slowest.
      * Fourier frequency indices in each dimension i are the integers lying
        in [-Ni/2, (Ni-1)/2]. See above, and modeord in opts.rst for possible orderings.
- 
- 
+
+
 ::
- 
- int finufft3d3(int64_t M, double* x, double* y, double* z, complex<double>* c, int iflag, 
- double eps, int64_t N, double* s, double* t, double* u, complex<double>* f, finufft_opts* 
+
+ int finufft3d3(int64_t M, double* x, double* y, double* z, complex<double>* c, int iflag,
+ double eps, int64_t N, double* s, double* t, double* u, complex<double>* f, finufft_opts*
  opts)
- int finufftf3d3(int64_t M, float* x, float* y, float* z, complex<float>* c, int iflag, 
- float eps, int64_t N, float* s, float* t, float* u, complex<float>* f, finufftf_opts* 
+ int finufftf3d3(int64_t M, float* x, float* y, float* z, complex<float>* c, int iflag,
+ float eps, int64_t N, float* s, float* t, float* u, complex<float>* f, finufftf_opts*
  opts)
- 
- int finufft3d3many(int ntr, int64_t M, double* x, double* y, double* z, complex<double>* 
- c, int iflag, double eps, int64_t N, double* s, double* t, double* u, complex<double>* f, 
+
+ int finufft3d3many(int ntr, int64_t M, double* x, double* y, double* z, complex<double>*
+ c, int iflag, double eps, int64_t N, double* s, double* t, double* u, complex<double>* f,
  finufft_opts* opts)
- int finufftf3d3many(int ntr, int64_t M, float* x, float* y, float* z, complex<float>* c, 
- int iflag, float eps, int64_t N, float* s, float* t, float* u, complex<float>* f, 
+ int finufftf3d3many(int ntr, int64_t M, float* x, float* y, float* z, complex<float>* c,
+ int iflag, float eps, int64_t N, float* s, float* t, float* u, complex<float>* f,
  finufftf_opts* opts)
- 
+
    3D complex nonuniform FFT of type 3 (nonuniform to nonuniform).
- 
+
    Computes to precision eps, via a fast algorithm, one or more transforms of the form:
- 
+
                M-1
       f[k]  =  SUM  c[j] exp(+-i (s[k] x[j] + t[k] y[j] + u[k] z[j])),
                j=0
@@ -130,7 +130,11 @@
      N      number of nonuniform frequency targets
      s,t,u  nonuniform frequency target coordinates in R^3 (length N real arrays)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
      f     Fourier transform values at targets (size N*ntr complex array)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
+
+   Notes:
+     * complex arrays interleave Re, Im values, and their size is stated with
+       dimensions ordered fastest to slowest.
diff --git a/docs/c3d.docsrc b/docs/c3d.docsrc
index 006f90722..f15fc74d7 100644
--- a/docs/c3d.docsrc
+++ b/docs/c3d.docsrc
@@ -79,4 +79,4 @@ int @F3d3(int64_t M, double* x, double* y, double* z, complex<double>* c, int if
   Outputs:
     f     Fourier transform values at targets (size N*ntr complex array)
 @r
-@no
\ No newline at end of file
+@no
diff --git a/docs/cguru.doc b/docs/cguru.doc
index f21db430b..16c2f8485 100644
--- a/docs/cguru.doc
+++ b/docs/cguru.doc
@@ -1,16 +1,16 @@
 ::
- 
- int finufft_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, double eps, 
+
+ int finufft_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, double eps,
  finufft_plan* plan, finufft_opts* opts)
- int finufftf_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, float eps, 
+ int finufftf_makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntr, float eps,
  finufftf_plan* plan, finufftf_opts* opts)
- 
+
    Make a plan to perform one or more general transforms.
- 
+
    Under the hood, for type 1 and 2, this does FFTW planning and kernel Fourier
    transform precomputation. For type 3, this does very little, since the FFT
    sizes are not yet known.
- 
+
  Inputs:
       type   type of transform (1,2, or 3)
       dim    spatial dimension (1,2, or 3)
@@ -22,36 +22,36 @@
      eps    desired relative precision; smaller is slower. This can be chosen
             from 1e-1 down to ~ 1e-14 (in double precision) or 1e-6 (in single)
      opts   pointer to options struct (see opts.rst), or NULL for defaults
- 
+
    Outputs:
       plan   plan object (under the hood this is a pointer to another struct)
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
       * All available threads are planned by default (but see opts.nthreads)
       * The vectorized (many vector) plan, ie ntrans>1, can be much faster
         than repeated calls with the same nonuniform points.
         Note that here the I/O data ordering is stacked rather than interleaved.
       * For more details about the fields in the opts pointer, see opts.rst
- 
- 
+
+
 ::
- 
- int finufft_setpts(finufft_plan plan, int64_t M, double* x, double* y, double* z, int64_t 
+
+ int finufft_setpts(finufft_plan plan, int64_t M, double* x, double* y, double* z, int64_t
  N, double* s, double* t, double* z)
- int finufftf_setpts(finufftf_plan plan, int64_t M, float* x, float* y, float* z, int64_t 
+ int finufftf_setpts(finufftf_plan plan, int64_t M, float* x, float* y, float* z, int64_t
  N, float* s, float* t, float* z)
- 
+
    Input nonuniform points with coordinates x (and possibly y, and possibly z),
    and, if type 3, nonuniform frequency target coordinates s (and possibly t,
    and possibly u), into an existing plan. If type is 1 or 2 then the last four
    arguments are ignored. Unused dimensions are ignored.
- 
+
    Under the hood, for type 1 or 2, this routine bin-sorts the points (storing
    just the permutation rather than new copies of the coordinates). For type
    3 it also bin-sorts the frequencies, chooses two levels of grid sizes, then
    plans the inner type 2 call (interpolation and FFTW).
- 
+
    Inputs:
       M      number of nonuniform spatial points (used by all types)
       x      nonuniform point x-coordinates (length M real array)
@@ -66,13 +66,13 @@
              ignored otherwise
       u      if dim>2, nonuniform frequency z-coordinates (length N real array),
              ignored otherwise
- 
+
    Input/Outputs:
       plan   plan object
- 
+
    Outputs:
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * The coordinates in x (and if nonempty, y and z) can be any real numbers.
        For type 1 and 2 transforms, their definitions imply that that the
@@ -87,22 +87,22 @@
        sizes is not too large (it controls the internal fine grid size).
      * The coordinates pointed to by any used arrays x, y, z, s, t, u must
        not be changed between this call and the below execute call!
- 
- 
+
+
 ::
- 
+
  int finufft_execute(finufft_plan plan, complex<double>* c, complex<double>* f)
  int finufftf_execute(finufftf_plan plan, complex<float>* c, complex<float>* f)
- 
+
    Perform one or more NUFFT transforms using previously entered nonuniform
    points and an existing plan. To summarize, this maps
      type 1: c -> f
-     type 2: f -> c      
+     type 2: f -> c
      type 3: c -> f
- 
+
    Inputs:
         plan   plan object
- 
+
    Input/Outputs:
         c      For types 1 and 3, the input strengths at the nonuniform point
                sources (size M*ntr complex array).
@@ -116,28 +116,28 @@
                respectively).
                If type 3, the output values at the nonuniform frequency targets
                (size N*ntr complex array).
-               
+
    Outputs:
      return value  0: success, 1: success but warning, >1: error (see error.rst)
- 
+
    Notes:
      * The contents of the arrays x, y, z, s, t, u must not have changed since
        the finufft_setpts call that read them. The execution rereads them
        (this way of doing business saves RAM).
      * f and c are contiguous Fortran-style arrays with the transform number,
        if ntr>1, being the "slowest" (outer) dimension.
- 
- 
+
+
 ::
- 
+
  int finufft_destroy(finufft_plan plan)
  int finufftf_destroy(finufftf_plan plan)
- 
+
    Deallocate a plan object. This must be used upon clean-up, or before reusing
    a plan in another call to finufft_makeplan.
- 
+
    Inputs/Outputs:
         plan   plan object
- 
+
    Outputs:
      return value  0: success, 1: success but warning, >1: error (see error.rst)
diff --git a/docs/cguru.docsrc b/docs/cguru.docsrc
index 49ab62df3..7747aab7f 100644
--- a/docs/cguru.docsrc
+++ b/docs/cguru.docsrc
@@ -83,7 +83,7 @@ int @G_execute(finufft_plan plan, complex<double>* c, complex<double>* f)
   Perform one or more NUFFT transforms using previously entered nonuniform
   points and an existing plan. To summarize, this maps
     type 1: c -> f
-    type 2: f -> c      
+    type 2: f -> c
     type 3: c -> f
 
   Inputs:
@@ -102,7 +102,7 @@ int @G_execute(finufft_plan plan, complex<double>* c, complex<double>* f)
               respectively).
               If type 3, the output values at the nonuniform frequency targets
               (size N*ntr complex array).
-              
+
   Outputs:
 @r
 
diff --git a/docs/install_gpu.rst b/docs/install_gpu.rst
index 360543ac1..ec6a74b60 100644
--- a/docs/install_gpu.rst
+++ b/docs/install_gpu.rst
@@ -39,12 +39,12 @@ In order to configure cuFINUFFT for a specific compute capability, use the ``CMA
 To find out your own device's compute capability without having to look it up on the web, use:
 
 .. code-block:: bash
-                
+
     nvidia-smi --query-gpu=compute_cap --format=csv,noheader
 
 This will return a text string such as ``8.6`` which would incidate
 ``sm_86`` architecture, thus to use ``CMAKE_CUDA_ARCHITECTURES=86``.
-    
+
 
 Testing
 -------
diff --git a/docs/latexindex.rst b/docs/latexindex.rst
index b98e67544..87ff7ebeb 100644
--- a/docs/latexindex.rst
+++ b/docs/latexindex.rst
@@ -8,7 +8,7 @@
 
 :orphan:
 
-   
+
 Flatiron Institute Nonuniform Fast Fourier Transform
 =====================================================
 
@@ -16,23 +16,23 @@ Flatiron Institute Nonuniform Fast Fourier Transform
    numbering. Note that ch.1 is now the overview (unlike in index.rst):
 
 .. toctree::
-	   
+
    overview
    install
    dirs
    math
-   cex           
+   cex
    c
    opts
    error
    trouble
    tut
-   fortran          
+   fortran
    matlab
    python
    julia
    changelog
-   devnotes           
+   devnotes
    related
    users
    ackn
diff --git a/docs/makefile.doc b/docs/makefile.doc
index 75684bde0..711eddae6 100644
--- a/docs/makefile.doc
+++ b/docs/makefile.doc
@@ -1,4 +1,5 @@
-Makefile for FINUFFT library. Please specify your task:
+make[1]: Entering directory '/home/marco/repos/finufft'
+Makefile for FINUFFT CPU library. Please specify your task:
  make lib - build the main library (in lib/ and lib-static/)
  make examples - compile and run all codes in examples/
  make test - compile and run quick math validation tests
@@ -22,3 +23,4 @@ Make options:
  You must at least 'make objclean' before changing such options!
 
 Also see docs/install.rst and docs/README
+make[1]: Leaving directory '/home/marco/repos/finufft'
diff --git a/docs/math.rst b/docs/math.rst
index 89c3356ba..c535c985c 100644
--- a/docs/math.rst
+++ b/docs/math.rst
@@ -15,7 +15,7 @@ Type 1 and type 2 involve the Fourier "modes" (Fourier series coefficients)
 with integer frequency indices lying in the Cartesian product set
 
 .. math::
-   
+
    K = K_{N_1,\dots,N_d} := K_{N_1} \times K_{N_2} \times \dots \times K_{N_d}~,
 
 where
@@ -40,21 +40,21 @@ Then the **type 1** (nonuniform to uniform, aka "adjoint") NUFFT evaluates
 
 .. math::
   :label: 1
-   
+
   f_\mathbf{k} := \sum_{j=1}^M c_j e^{\pm i \mathbf{k}\cdot \mathbf{x}_j}
   \qquad \mbox{for } \mathbf{k} \in K
 
-  
+
 This can be viewed as evaluating a set of
 Fourier series coefficients due to sources
-with strengths $c_j$ at the arbitrary locations $\mathbf{x}_j$.	  
+with strengths $c_j$ at the arbitrary locations $\mathbf{x}_j$.
 Either sign of the imaginary unit in the exponential can be chosen in the interface. Note that our normalization differs from that of references [DR,GL].
 
 The **type 2** (U to NU, aka "forward") NUFFT evaluates
 
 .. math::
    :label: 2
-	   
+
    c_j := \sum_{\mathbf{k}\in K} f_\mathbf{k} e^{\pm i \mathbf{k}\cdot \mathbf{x}_j}
    \qquad \mbox{for } j=1,\ldots,M
 
@@ -83,7 +83,7 @@ Then the type 3 transform evaluates:
 
 .. math::
   :label: 3
-   
+
   f_k := \sum_{j=1}^M c_j e^{\pm i \mathbf{s}_k\cdot \mathbf{x}_j}
    \qquad \mbox{for } k=1,\ldots,N
 
diff --git a/docs/matlab.rst b/docs/matlab.rst
index ecd05f141..efede7970 100644
--- a/docs/matlab.rst
+++ b/docs/matlab.rst
@@ -32,7 +32,7 @@ their defaults, for instance:
 
 .. code-block:: matlab
 
-  o.modeord = 1;                      % choose FFT-style output mode ordering  
+  o.modeord = 1;                      % choose FFT-style output mode ordering
   f = finufft1d1(x,c,+1,1e-12,N,o);   % do it
 
 The above usage we call the "simple" interface. There is also a "vectorized"
@@ -75,7 +75,7 @@ Here we use the guru interface to repeat the first demo above:
   delete(plan);                       % don't forget to clean up
 
 .. warning::
-     
+
    If an existing array is passed to ``setpts``, then this array must not be altered before ``execute`` is called! This is because, in order to save RAM (allowing larger problems to be solved), internally FINUFFT stores only *pointers* to ``x`` (etc), rather than unnecessarily duplicating this data. This is not true if an *expression* such as ``-x`` or ``2*pi*rand(M,1)`` is passed to ``setpts``, since in those cases the ``plan`` object does make internal copies, as per MATLAB's usual shallow-copy argument passing.
 
 Finally, we demo a 2D type 1 transform using the simple interface. Let's
@@ -124,5 +124,5 @@ MATLAB path via something like ``addpath FINUFFT/matlab``, then
 .. literalinclude:: ../matlab/Contents.m
 
 The individual commands have the following help documentation:
-                    
+
 .. include:: matlabhelp.doc
diff --git a/docs/nfft_migr.rst b/docs/nfft_migr.rst
index 114807b1a..f7616447a 100644
--- a/docs/nfft_migr.rst
+++ b/docs/nfft_migr.rst
@@ -18,7 +18,7 @@ Also of interest (but not yet demonstrated below) is:
 
  * the forward NFFT transform (a.k.a. type 2)
  * the nonuniform to nonuniform NNFFT (a.k.a. type 3)
-   
+
  .. note:: The NFFT3 library can do more things---real-valued data, sphere, rotation group, hyperbolic cross, inverse transforms---none of which FINUFFT can yet do directly (although our three transforms can be used as components in such tasks). We do not address those here.
 
 Migrating a 2D adjoint transform (type 1) in C from NFFT3 to FINUFFT
diff --git a/docs/refs.rst b/docs/refs.rst
index 11663596e..38eb4b37f 100644
--- a/docs/refs.rst
+++ b/docs/refs.rst
@@ -83,4 +83,3 @@ These
 may be a useful introduction to FINUFFT and applications.
 
 Yu-Hsuan (Melody) Shih's PDSEC2021 20-minute presentation video on cuFINUFFT is here: https://www.youtube.com/watch?v=PnW6ehMyHxM
-
diff --git a/docs/tut.rst b/docs/tut.rst
index 57af9f67d..ed329707e 100644
--- a/docs/tut.rst
+++ b/docs/tut.rst
@@ -27,8 +27,7 @@ For further applications, see :ref:`references <refs>`, and:
 * `Fast Fresnel diffraction <https://github.com/ahbarnett/fresnaq>`_ for optics and acoustics applications.
 
 * `Equispaced Fourier methods for Gaussian process regression <https://github.com/flatironinstitute/gp-shootout>`_ as described in https://arxiv.org/abs/2210.10210 and https://arxiv.org/abs/2305.11065
-  
+
 * Tutorials for PyNUFFT with 1D and 2D reconstruction examples `here <http://jyhmiinlin.github.io/pynufft/tutor/init.html>`_.
-     
-* The numerical sampling of `random plane waves <https://users.flatironinstitute.org/~ahb/rpws/>`_.
 
+* The numerical sampling of `random plane waves <https://users.flatironinstitute.org/~ahb/rpws/>`_.
diff --git a/docs/tutorial/contft.rst b/docs/tutorial/contft.rst
index 5574420f5..f00b2468b 100644
--- a/docs/tutorial/contft.rst
+++ b/docs/tutorial/contft.rst
@@ -72,7 +72,7 @@ Notice :eq:`fint` is simply a type 3 NUFFT with strengths $c_j = f(x_j) w_j$,
 so we evaluate it by calling FINUFFT (this takes 0.1 sec) then plot the resulting FT at its target $k$ points:
 
 .. code-block:: matlab
-           
+
   tol = 1e-10;
   fhat = finufft1d3(xj, f(xj).*wj, +1, tol, k);
   plot(k, [real(fhat),imag(fhat)], '.');
@@ -152,7 +152,7 @@ you will have to shift (by it to this specific offset, then post-multiply
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The above $f$ was merely discontinuous. But you can now go further and easily
-replace $(x_j,w_j)$ by a rule that is accurate for 
+replace $(x_j,w_j)$ by a rule that is accurate for
 a function with known singularities. Eg, say $f(x) = x^{-1/2}g(x)$ where $g$ is smooth
 on $[0,1]$, then the change of variable $x=y^2$ means
 that $\int_0^1 f(x) dx = \int_0^1 2y f(y) dy$, the latter having a
@@ -182,7 +182,7 @@ This exhibits rapid convergence kinking in at a slightly higher $N$, while $\hat
 now has even slower decay (which one can check is $|k|^{-1/2}$):
 
 .. code-block:: matlab
-                
+
                        180         0.208975054515039
                        200      3.04233050928417e-05
                        220       1.9202016281569e-10
@@ -221,7 +221,7 @@ Since $g$ is smooth, this will have spectral convergence in $n$ and $m$.
 Here is a fresh code to make this quadrature over $\Omega$:
 
 .. code-block:: matlab
-                
+
  g = @(t) 1 + 0.3*cos(3*t);                             % boundary shape
  n = 280;                                               % # theta nodes
  t = 2*pi*(1:n)/n; wt = (2*pi/n);                       % theta nodes, const weights
@@ -235,7 +235,7 @@ Here is a fresh code to make this quadrature over $\Omega$:
    wj(jj) = wt*r^2*xr.*wr;            % theta weight times rule for r.dr on (0,r)
  end
  plot([bx bx(1)],[by by(1)],'-'); hold on; plot(xj,yj,'.');
-                
+
 .. image:: ../pics/contft2dnodes.png
    :width: 50%
 
@@ -257,7 +257,7 @@ become the source strengths. We also image the result on a log scale:
 
 .. image:: ../pics/contft2dans.png
    :width: 70%
-             
+
 Thus we have computed the 2D FT of a discontinous function on a million-point grid to around 10-digit accuracy in 0.05 sec (the FINUFFT transform time).
 Note that, as with 1D discontinuous functions, the decay with $k:=|{\bf k}|$ is slow (it is like $1/k$).
 See the full code `tutorial/contft2d.m <https://github.com/flatironinstitute/finufft/blob/master/tutorial/contft2d.m>`_ also for the study that shows that, for the above ``kmax``, convergence to the tolerance has occurred by ``n=280`` and ``m=70``, needing $N=19600$ nodes. A more efficient set would vary ``m`` with $\theta$.
@@ -271,7 +271,7 @@ See the full code `tutorial/contft2d.m <https://github.com/flatironinstitute/fin
    If a downstream planar detector is very distant (the Fraunhofer diffraction limit),
    and the angles of scattering are small,
    then $|\hat{f}|^2$ is a good model for the detected scattered intensity.
-                
+
 
 Further reading
 ~~~~~~~~~~~~~~~~
@@ -289,4 +289,3 @@ be on a regular grid. They also can be useful for known singularities
 Kirchhoff approximation and Fraunhofer diffraction in optics:
 
 * M. Born and E. Wolf, *Principles of Optics*, 6th edition. Section 8.3.
-
diff --git a/docs/tutorial/grf.rst b/docs/tutorial/grf.rst
index 1bccf0026..caaee6aad 100644
--- a/docs/tutorial/grf.rst
+++ b/docs/tutorial/grf.rst
@@ -10,7 +10,7 @@ invariant.
 It is also known as a Gaussian process prior.
 Given a GRF model,
 it is important to be able to 1) draw independent samples from the GRF
-at a given set of target points, 
+at a given set of target points,
 and 2) compute the log likelihood (under the model) of a given function values at a given set of target points.
 We focus on the first application. (For the second, see reference at the
 bottom of this page.)
@@ -53,7 +53,7 @@ For background on Gaussian random fields, aka, Gaussian processes,
 see:
 
   * C E Rasmussen & C K I Williams, *Gaussian Processes for Machine Learning*, the MIT Press, 2006. http://www.GaussianProcess.org/gpml
-    
+
   * Fast Direct Methods for Gaussian Processes,
     Sivaram Ambikasaran, Daniel Foreman-Mackey, Leslie Greengard,
     David W. Hogg, and Michael O’Neil. arxiv:1403.6015 (2015)
diff --git a/docs/tutorial/peripois2d.rst b/docs/tutorial/peripois2d.rst
index 8797d08f0..164b9c44a 100644
--- a/docs/tutorial/peripois2d.rst
+++ b/docs/tutorial/peripois2d.rst
@@ -19,7 +19,7 @@ back on the original grid. Here is a MATLAB demo in $d=2$ dimensions.
 Firstly we set up a smooth function, periodic up to machine precision:
 
 .. code-block:: matlab
-  
+
   w0 = 0.1;  % width of bumps
   src = @(x,y) exp(-0.5*((x-1).^2+(y-2).^2)/w0^2)-exp(-0.5*((x-3).^2+(y-5).^2)/w0^2);
 
@@ -27,7 +27,7 @@ Now we do the FFT solve, using a loop to check convergence with respect to
 ``n`` the number of grid points in each dimension:
 
 .. code-block:: matlab
-  
+
   ns = 40:20:120;              % convergence study of grid points per side
   for i=1:numel(ns), n = ns(i);
     x = 2*pi*(0:n-1)/n;        % grid
@@ -55,7 +55,7 @@ We observe spectral convergence to 14 digits::
 Here we plot the FFT solution:
 
 .. code-block:: matlab
-  
+
   figure; subplot(1,2,1); imagesc(x,x,f'); colorbar('southoutside');
   axis xy equal tight; title('source term f'); xlabel('x'); ylabel('y');
   subplot(1,2,2); imagesc(x,x,u'); colorbar('southoutside');
@@ -63,7 +63,7 @@ Here we plot the FFT solution:
 
 .. image:: ../pics/pois_fft.png
    :width: 90%
-           
+
 Now let's say you wish to do a similar Poisson solve on a **non-Cartesian grid** covering the same
 domain. There are two cases: a) the grid is unstructured and you do
 not know the weights of a quadrature scheme, or b) you do know the weights
@@ -72,7 +72,7 @@ structured, such as arising from a different coordinate system or an adaptive su
 $j=1,\dots, M$, and
 weights $w_j$ such that, for all smooth functions $f$,
 
-.. math::   
+.. math::
   \int_{[0,2\pi)^d} f(x) dx \approx \sum_{j=1}^M f(x_j) w_j
 
 holds to sufficient accuracy. We consider case b) only.
@@ -82,7 +82,7 @@ diffeomorphism from $[0,2\pi)^2$ to itself to define a distorted mesh
 Jacobian):
 
 .. code-block:: matlab
-  
+
   map = @(t,s) [t + 0.5*sin(t) + 0.2*sin(2*s); s + 0.3*sin(2*s) + 0.3*sin(s-t)];
   mapJ = @(t,s) [1 + 0.5*cos(t), 0.4*cos(2*s); ...
                 -0.3*cos(s-t),  1+0.6*cos(2*s)+0.3*cos(s-t)]; % its 2x2 Jacobian
@@ -102,7 +102,7 @@ noting that it covers the domain when periodically extended:
 
 .. image:: ../pics/pois_nugrid.png
    :width: 40%
-           
+
 To solve on this grid, replace step 1 above by evaluating the Euler-Fourier formula using the quadrature scheme, which needs a type-1 NUFFT, and step 3 (evaluation on the nonuniform grid) by a type-2 NUFFT. Step 2 (the frequency filter) remains the same. Here is the demo code:
 
 .. code-block:: matlab
@@ -129,7 +129,7 @@ To solve on this grid, replace step 1 above by evaluating the Euler-Fourier form
     u = reshape(real(u),[n n]);
     fprintf('n=%d:\tNk=%d\tu(0,0) = %.15e\n',n,Nk,u(1,1))  % check conv at same pt
   end
-            
+
 Here a convergence parameter (``Nk = 0.5*n``) had to be set to
 choose how many modes to trust with the quadrature. Thus ``n`` is about
 twice what it needed to be in the uniform case, accounting for the stretching
@@ -145,13 +145,13 @@ and matches the FFT solution at the test point to 12 relative digits::
 
 .. image:: ../pics/pois_nufft.png
    :width: 90%
-           
+
 Finally, here is the decay of the modes $\hat{f}_k$ on a log plot, for the
 FFT and NUFFT versions. They are identical down to the level ``tol``:
 
 .. image:: ../pics/pois_fhat.png
    :width: 90%
-           
+
 The full code is at
 `tutorial/poisson2dnuquad.m <https://github.com/flatironinstitute/finufft/blob/master/tutorial/poisson2dnuquad.m>`_.
 
diff --git a/docs/tutorial/serieseval.rst b/docs/tutorial/serieseval.rst
index 038ca3194..ad8924fe3 100644
--- a/docs/tutorial/serieseval.rst
+++ b/docs/tutorial/serieseval.rst
@@ -53,7 +53,7 @@ Compare this to a naive calculation (which serves to remind us exactly what sum
   norm(c-cn,inf)
 
 ::
-                
+
   Elapsed time is 11.679265 seconds.
   ans =
         1.76508266507874e-11
@@ -69,10 +69,10 @@ multithreaded summation. (Naive summation with reversed loop order is even worse
 
 .. image:: ../pics/fser1d.png
    :width: 90%
- 
+
 See the full code `tutorial/serieseval1d.m <https://github.com/flatironinstitute/finufft/blob/master/tutorial/serieseval1d.m>`_ which also shows
 how to evaluate the same series on a uniform grid via the plain FFT.
-           
+
 2D Fourier series
 ~~~~~~~~~~~~~~~~~
 
@@ -104,7 +104,7 @@ target points you like:
 ::
 
   Elapsed time is 0.092743 seconds.
-     
+
 1 million modes to 1 million points in 92 milliseconds on a laptop is decent.
 We check the math (using a relative error measure) at just one (generic) point:
 
@@ -113,12 +113,12 @@ We check the math (using a relative error measure) at just one (generic) point:
   j = 1;                        % do math check on 1st target...
   c1 = sum(sum(fk.*exp(1i*(k1*x(j)+k2*y(j)))));
   abs(c1-c(j)) / norm(c,inf)
-                
+
 ::
-                    
+
   ans =
         2.30520830208365e-10
-            
+
 Finally we use a colored scatter plot to show the first $10\%$ of the points in the square, and see samples of the underlying random field (reminiscent of WMAP microwave background data):
 
 .. code-block:: matlab
@@ -126,9 +126,8 @@ Finally we use a colored scatter plot to show the first $10\%$ of the points in
   jplot = 1:1e5;           % indices to plot
   scatter(x(jplot),y(jplot),1.0,real(c(jplot)),'filled'); axis tight equal
   xlabel('x'); ylabel('y'); colorbar; title('Re f(x,y)');
-                
+
 .. image:: ../pics/fser2d.png
    :width: 70%
- 
-See the full code `tutorial/serieseval2d.m <https://github.com/flatironinstitute/finufft/blob/master/tutorial/serieseval2d.m>`_.
 
+See the full code `tutorial/serieseval2d.m <https://github.com/flatironinstitute/finufft/blob/master/tutorial/serieseval2d.m>`_.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index eeda37050..a0fa360bf 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -5,39 +5,40 @@ set(EXAMPLES
     many1d1
     simple1d1
     simple1d1f
-    simulplans1d1)
+    simulplans1d1
+)
 set(EXAMPLES_OPENMP threadsafe1d1 threadsafe2d2f)
 set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf)
 
 find_library(MATH_LIBRARY m)
 
 foreach(EXAMPLE ${EXAMPLES})
-  add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
-  target_compile_features(${EXAMPLE} PRIVATE cxx_std_14)
-  target_link_libraries(${EXAMPLE} PRIVATE finufft)
-  if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT")
-    enable_asan(${EXAMPLE})
-  endif()
+    add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+    target_compile_features(${EXAMPLE} PRIVATE cxx_std_14)
+    target_link_libraries(${EXAMPLE} PRIVATE finufft)
+    if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT")
+        enable_asan(${EXAMPLE})
+    endif()
 endforeach()
 
 foreach(EXAMPLE ${EXAMPLES_C})
-  add_executable(${EXAMPLE} ${EXAMPLE}.c)
-  target_link_libraries(${EXAMPLE} PRIVATE finufft)
-  if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT")
-    enable_asan(${EXAMPLE})
-  endif()
-  if(MATH_LIBRARY)
-    target_link_libraries(${EXAMPLE} PRIVATE ${MATH_LIBRARY})
-  endif()
+    add_executable(${EXAMPLE} ${EXAMPLE}.c)
+    target_link_libraries(${EXAMPLE} PRIVATE finufft)
+    if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT")
+        enable_asan(${EXAMPLE})
+    endif()
+    if(MATH_LIBRARY)
+        target_link_libraries(${EXAMPLE} PRIVATE ${MATH_LIBRARY})
+    endif()
 endforeach()
 
 if(FINUFFT_USE_OPENMP)
-  foreach(EXAMPLE ${EXAMPLES_OPENMP})
-    add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
-    target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
-    target_compile_features(${EXAMPLE} PRIVATE cxx_std_11)
-    if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT")
-      enable_asan(${EXAMPLE})
-    endif()
-  endforeach()
+    foreach(EXAMPLE ${EXAMPLES_OPENMP})
+        add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+        target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
+        target_compile_features(${EXAMPLE} PRIVATE cxx_std_11)
+        if(CMAKE_PROJECT_NAME STREQUAL "FINUFFT")
+            enable_asan(${EXAMPLE})
+        endif()
+    endforeach()
 endif()
diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt
index b9742a865..d08d606b1 100644
--- a/examples/cuda/CMakeLists.txt
+++ b/examples/cuda/CMakeLists.txt
@@ -1,10 +1,10 @@
 file(GLOB example_src "*.cpp")
 
 foreach(srcfile ${example_src})
-  string(REPLACE ".cpp" "" executable ${srcfile})
-  get_filename_component(executable ${executable} NAME)
-  add_executable(${executable} ${srcfile})
-  target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-  target_link_libraries(${executable} cufinufft)
-  target_compile_features(${executable} PRIVATE cxx_std_17)
+    string(REPLACE ".cpp" "" executable ${srcfile})
+    get_filename_component(executable ${executable} NAME)
+    add_executable(${executable} ${srcfile})
+    target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
+    target_link_libraries(${executable} cufinufft)
+    target_compile_features(${executable} PRIVATE cxx_std_17)
 endforeach()
diff --git a/fortran/CMakeLists.txt b/fortran/CMakeLists.txt
index b02396cfe..64d035964 100644
--- a/fortran/CMakeLists.txt
+++ b/fortran/CMakeLists.txt
@@ -1,16 +1,27 @@
 add_library(
-  directft OBJECT directft/dirft1d.f directft/dirft1df.f directft/dirft2d.f
-                  directft/dirft2df.f directft/dirft3d.f directft/dirft3df.f)
+    directft
+    OBJECT
+    directft/dirft1d.f
+    directft/dirft1df.f
+    directft/dirft2d.f
+    directft/dirft2df.f
+    directft/dirft3d.f
+    directft/dirft3df.f
+)
 
-set(FORTRAN_EXAMPLES guru1d1 nufft1d_demo nufft2d_demo nufft2dmany_demo
-                     nufft3d_demo simple1d1)
+set(FORTRAN_EXAMPLES
+    guru1d1
+    nufft1d_demo
+    nufft2d_demo
+    nufft2dmany_demo
+    nufft3d_demo
+    simple1d1
+)
 
 foreach(EXAMPLE ${FORTRAN_EXAMPLES})
-  add_executable(fort_${EXAMPLE} examples/${EXAMPLE}.f)
-  add_executable(fort_${EXAMPLE}f examples/${EXAMPLE}f.f)
+    add_executable(fort_${EXAMPLE} examples/${EXAMPLE}.f)
+    add_executable(fort_${EXAMPLE}f examples/${EXAMPLE}f.f)
 
-  target_link_libraries(fort_${EXAMPLE} PRIVATE directft finufft
-                                                ${FINUFFT_FFTLIBS})
-  target_link_libraries(fort_${EXAMPLE}f PRIVATE directft finufft
-                                                 ${FINUFFT_FFTLIBS})
+    target_link_libraries(fort_${EXAMPLE} PRIVATE directft finufft ${FINUFFT_FFTLIBS})
+    target_link_libraries(fort_${EXAMPLE}f PRIVATE directft finufft ${FINUFFT_FFTLIBS})
 endforeach()
diff --git a/fortran/cmcl_license.txt b/fortran/cmcl_license.txt
index cd94a0ae4..0504ecbb9 100644
--- a/fortran/cmcl_license.txt
+++ b/fortran/cmcl_license.txt
@@ -8,13 +8,13 @@ Copyright (c) 2009-2014, Leslie Greengard, June-Yub Lee and Zydrunas Gimbutas
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met: 
+modification, are permitted provided that the following conditions are met:
 
 1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer. 
+   list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution. 
+   and/or other materials provided with the distribution.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
@@ -28,5 +28,5 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 The views and conclusions contained in the software and documentation are those
-of the authors and should not be interpreted as representing official policies, 
+of the authors and should not be interpreted as representing official policies,
 either expressed or implied, of the FreeBSD Project.
diff --git a/fortran/directft/README b/fortran/directft/README
index 008f329df..bde21642f 100644
--- a/fortran/directft/README
+++ b/fortran/directft/README
@@ -1,3 +1,3 @@
 This directory contains the CMCL NUFFT direct summation implementations,
 plus single-precision versions by Alex Barnett, 2017.
-It also contains the legendary prini.f, which is currently unused.
\ No newline at end of file
+It also contains the legendary prini.f, which is currently unused.
diff --git a/fortran/directft/dirft1d.f b/fortran/directft/dirft1d.f
index 4e36286c1..ff964550f 100644
--- a/fortran/directft/dirft1d.f
+++ b/fortran/directft/dirft1d.f
@@ -1,8 +1,8 @@
-cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee 
+cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee
 cc Contact: greengard@cims.nyu.edu
 cc
 cc This software is being released under a FreeBSD license
-cc (see license.txt in this directory). 
+cc (see license.txt in this directory).
 c***********************************************************************
       subroutine dirft1d1(nj,xj,cj, iflag, ms,fk)
       implicit none
@@ -13,7 +13,7 @@ subroutine dirft1d1(nj,xj,cj, iflag, ms,fk)
 c     direct computation of nonuniform FFT
 c
 c                   nj
-c     fk(k1)    =  SUM cj(j) exp(+/-i k1 xj(j)) 
+c     fk(k1)    =  SUM cj(j) exp(+/-i k1 xj(j))
 c                  j=1
 c
 c     for -ms/2 <= k1 <= (ms-1)/2
@@ -64,8 +64,8 @@ subroutine dirft1d2(nj,xj,cj, iflag, ms,fk)
 c ----------------------------------------------------------------------
 c     direct computation of nonuniform FFT
 c
-c     cj(j) = SUM   fk(k1) exp(+/-i k1 xj(j)) 
-c             k1  
+c     cj(j) = SUM   fk(k1) exp(+/-i k1 xj(j))
+c             k1
 c                            for j = 1,...,nj
 c
 c     where -ms/2 <= k1 <= (ms-1)/2
@@ -109,8 +109,8 @@ subroutine dirft1d3(nj,xj,cj, iflag, nk,sk,fk)
 c     direct computation of nonuniform FFT
 c
 c              nj
-c     fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) 
-c             j=1                   
+c     fk(k) = SUM cj(j) exp(+/-i s(k) xj(j))
+c             j=1
 c
 c                    for k = 1, ..., nk
 c
diff --git a/fortran/directft/dirft1df.f b/fortran/directft/dirft1df.f
index 4756670d6..cf0edd527 100644
--- a/fortran/directft/dirft1df.f
+++ b/fortran/directft/dirft1df.f
@@ -1,4 +1,4 @@
-cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee 
+cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee
 cc Contact: greengard@cims.nyu.edu
 cc
 cc This software is being released under a FreeBSD license
@@ -14,7 +14,7 @@ subroutine dirft1d1f(nj,xj,cj, iflag, ms,fk)
 c     direct computation of nonuniform FFT
 c
 c                   nj
-c     fk(k1)    =  SUM cj(j) exp(+/-i k1 xj(j)) 
+c     fk(k1)    =  SUM cj(j) exp(+/-i k1 xj(j))
 c                  j=1
 c
 c     for -ms/2 <= k1 <= (ms-1)/2
@@ -65,8 +65,8 @@ subroutine dirft1d2f(nj,xj,cj, iflag, ms,fk)
 c ----------------------------------------------------------------------
 c     direct computation of nonuniform FFT
 c
-c     cj(j) = SUM   fk(k1) exp(+/-i k1 xj(j)) 
-c             k1  
+c     cj(j) = SUM   fk(k1) exp(+/-i k1 xj(j))
+c             k1
 c                            for j = 1,...,nj
 c
 c     where -ms/2 <= k1 <= (ms-1)/2
@@ -110,8 +110,8 @@ subroutine dirft1d3f(nj,xj,cj, iflag, nk,sk,fk)
 c     direct computation of nonuniform FFT
 c
 c              nj
-c     fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) 
-c             j=1                   
+c     fk(k) = SUM cj(j) exp(+/-i s(k) xj(j))
+c             j=1
 c
 c                    for k = 1, ..., nk
 c
diff --git a/fortran/directft/dirft2d.f b/fortran/directft/dirft2d.f
index eecae6392..ad763f8d9 100644
--- a/fortran/directft/dirft2d.f
+++ b/fortran/directft/dirft2d.f
@@ -1,8 +1,8 @@
-cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee 
+cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee
 cc Contact: greengard@cims.nyu.edu
-cc 
+cc
 cc This software is being released under a FreeBSD license
-cc (see license.txt in this directory). 
+cc (see license.txt in this directory).
 cc
       subroutine dirft2d1(nj,xj,yj,cj, iflag, ms,mt,fk)
       implicit none
@@ -175,7 +175,7 @@ subroutine dirft2d3(nj,xj,yj,cj, iflag, nk,sk,tk,fk)
 c
 c              nj
 c     fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) exp(+/-i t(k) yj(j))
-c             j=1                   
+c             j=1
 c
 c                    for k = 1, ..., nk
 c
diff --git a/fortran/directft/dirft2df.f b/fortran/directft/dirft2df.f
index 8ea5278d4..fe8205f57 100644
--- a/fortran/directft/dirft2df.f
+++ b/fortran/directft/dirft2df.f
@@ -1,9 +1,9 @@
-cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee 
+cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee
 cc Contact: greengard@cims.nyu.edu
-cc 
+cc
 cc This software is being release%d under a FreeBSD license
 cc (see license.txt in this directory).
-cc Single-prec version Barnett 4/5/17    
+cc Single-prec version Barnett 4/5/17
 cc
       subroutine dirft2d1f(nj,xj,yj,cj, iflag, ms,mt,fk)
       implicit none
@@ -176,7 +176,7 @@ subroutine dirft2d3f(nj,xj,yj,cj, iflag, nk,sk,tk,fk)
 c
 c              nj
 c     fk(k) = SUM cj(j) exp(+/-i s(k) xj(j)) exp(+/-i t(k) yj(j))
-c             j=1                   
+c             j=1
 c
 c                    for k = 1, ..., nk
 c
diff --git a/fortran/directft/dirft3d.f b/fortran/directft/dirft3d.f
index 6afbc76b5..bea4ba3b2 100644
--- a/fortran/directft/dirft3d.f
+++ b/fortran/directft/dirft3d.f
@@ -1,8 +1,8 @@
-cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee 
+cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee
 cc Contact: greengard@cims.nyu.edu
-cc 
+cc
 cc This software is being released under a FreeBSD license
-cc (see license.txt in this directory). 
+cc (see license.txt in this directory).
 cc
 ************************************************************************
       subroutine dirft3d1(nj,xj,yj,zj,cj,iflag,ms,mt,mu,fk)
@@ -18,7 +18,7 @@ subroutine dirft3d1(nj,xj,yj,zj,cj,iflag,ms,mt,mu,fk)
 c                     j=1       exp(+/-i k2 yj(j)) *
 c                                 exp(+/-i k3 zj(j))
 c
-c     for -ms/2 <= k1 <= (ms-1)/2, 
+c     for -ms/2 <= k1 <= (ms-1)/2,
 c         -mt/2 <= k2 <= (mt-1)/2
 c         -mu/2 <= k3 <= (mu-1)/2
 c
@@ -120,13 +120,13 @@ subroutine dirft3d2(nj,xj,yj,zj,cj, iflag, ms,mt,mu,fk)
 c     direct computation of nonuniform FFT
 c
 c
-c     cj(j) = SUM SUM SUM  fk(k1,k2,k3) exp(+/-i k1 xj(j)) * 
+c     cj(j) = SUM SUM SUM  fk(k1,k2,k3) exp(+/-i k1 xj(j)) *
 c             k1  k2  k3                exp(+/-i k2 yj(j))
 c                                       exp(+/-i k3 zj(j))
 c
 c                            for j = 1,...,nj
 c
-c     where -ms/2 <= k1 <= (ms-1)/2 
+c     where -ms/2 <= k1 <= (ms-1)/2
 c           -mt/2 <= k2 <= (mt-1)/2
 c           -mu/2 <= k3 <= (mu-1)/2
 c
@@ -274,4 +274,3 @@ subroutine dirft3d3(nj,xj,yj,zj,cj, iflag, nk,sk,tk,uk,fk)
          enddo
       enddo
       end
-
diff --git a/fortran/directft/dirft3df.f b/fortran/directft/dirft3df.f
index 05e788ce0..094f96a74 100644
--- a/fortran/directft/dirft3df.f
+++ b/fortran/directft/dirft3df.f
@@ -1,8 +1,8 @@
-cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee 
+cc Copyright (C) 2004-2009: Leslie Greengard and June-Yub Lee
 cc Contact: greengard@cims.nyu.edu
-cc 
+cc
 cc This software is being released under a FreeBSD license
-cc (see license.txt in this directory). 
+cc (see license.txt in this directory).
 cc Single-prec version Barnett 4/5/17
 cc
 ************************************************************************
@@ -19,7 +19,7 @@ subroutine dirft3d1f(nj,xj,yj,zj,cj,iflag,ms,mt,mu,fk)
 c                     j=1       exp(+/-i k2 yj(j)) *
 c                                 exp(+/-i k3 zj(j))
 c
-c     for -ms/2 <= k1 <= (ms-1)/2, 
+c     for -ms/2 <= k1 <= (ms-1)/2,
 c         -mt/2 <= k2 <= (mt-1)/2
 c         -mu/2 <= k3 <= (mu-1)/2
 c
@@ -121,13 +121,13 @@ subroutine dirft3d2f(nj,xj,yj,zj,cj, iflag, ms,mt,mu,fk)
 c     direct computation of nonuniform FFT
 c
 c
-c     cj(j) = SUM SUM SUM  fk(k1,k2,k3) exp(+/-i k1 xj(j)) * 
+c     cj(j) = SUM SUM SUM  fk(k1,k2,k3) exp(+/-i k1 xj(j)) *
 c             k1  k2  k3                exp(+/-i k2 yj(j))
 c                                       exp(+/-i k3 zj(j))
 c
 c                            for j = 1,...,nj
 c
-c     where -ms/2 <= k1 <= (ms-1)/2 
+c     where -ms/2 <= k1 <= (ms-1)/2
 c           -mt/2 <= k2 <= (mt-1)/2
 c           -mu/2 <= k3 <= (mu-1)/2
 c
@@ -275,4 +275,3 @@ subroutine dirft3d3f(nj,xj,yj,zj,cj, iflag, nk,sk,tk,uk,fk)
          enddo
       enddo
       end
-
diff --git a/fortran/directft/prini.f b/fortran/directft/prini.f
index 31a2536b6..95a838f90 100644
--- a/fortran/directft/prini.f
+++ b/fortran/directft/prini.f
@@ -132,7 +132,7 @@ subroutine msgmerge(a,b,c)
         do 1200 i=1,1000
 c
         if(a(i) .eq. ast) goto 1400
-        c(i)=a(i)       
+        c(i)=a(i)
         iadd=i
  1200 continue
 c
diff --git a/fortran/examples/guru1d1.f b/fortran/examples/guru1d1.f
index 22aed424a..4898001a7 100755
--- a/fortran/examples/guru1d1.f
+++ b/fortran/examples/guru1d1.f
@@ -4,7 +4,7 @@
 c     Legacy-style: f77, plus dynamic allocation & derived types from f90.
 
 c     To compile (linux/GCC) from this directory, use eg (paste to one line):
-      
+
 c     gfortran -fopenmp -I../../include -I/usr/include guru1d1.f
 c     ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ -o guru1d1
 
@@ -30,15 +30,15 @@ program guru1d1
       integer ttype,dim,ntrans
 c     to pass null pointers to unused arguments...
       real*8, pointer :: dummy => null()
-      
+
 c     this is what you use as the "opaque" ptr to ptr to finufft_plan...
       integer*8 plan
 c     this is how you create the options struct in fortran...
       type(finufft_opts) opts
 c     or this is if you want default opts, make a null pointer...
       type(finufft_opts), pointer :: defopts => null()
-      
-      
+
+
 c     how many nonuniform pts
       M = 1000000
 c     how many modes (not too much since FFTW_MEASURE slow later)
@@ -83,7 +83,7 @@ program guru1d1
       endif
       call finufft_destroy(plan,ier)
 
-      
+
 c     math test: single output mode with given freq (not array index) k
       ktest = N/3
       fktest = dcmplx(0,0)
@@ -133,6 +133,6 @@ program guru1d1
          print *,'failed! ier=',ier
       endif
       call finufft_destroy(plan,ier)
-      
+
       stop
       end
diff --git a/fortran/examples/guru1d1f.f b/fortran/examples/guru1d1f.f
index 316ad57ec..d95d695ec 100755
--- a/fortran/examples/guru1d1f.f
+++ b/fortran/examples/guru1d1f.f
@@ -4,7 +4,7 @@
 c     Legacy-style: f77, plus dynamic allocation & derived types from f90.
 
 c     To compile (linux/GCC) from this directory, use eg (paste to one line):
-      
+
 c     gfortran-9 -fopenmp -I../../include -I/usr/include guru1d1f.f
 c     -L../../lib -lfinufftf -o guru1d1f
 
@@ -37,7 +37,7 @@ program guru1d1f
       type(finufft_opts) opts
 c     or this is if you want default opts, make a null pointer...
       type(finufft_opts), pointer :: defopts => null()
-   
+
 c     how many nonuniform pts
       M = 200000
 c     how many modes (not too much since FFTW_MEASURE slow later)
@@ -82,7 +82,7 @@ program guru1d1f
       endif
       call finufftf_destroy(plan,ier)
 
-      
+
 c     math test: single output mode with given freq (not array index) k
       ktest = N/3
       fktest = cmplx(0,0)
@@ -132,6 +132,6 @@ program guru1d1f
          print *,'failed! ier=',ier
       endif
       call finufftf_destroy(plan,ier)
-      
+
       stop
       end
diff --git a/fortran/examples/nufft1d_demo.f b/fortran/examples/nufft1d_demo.f
index e63a7e434..a57cd6997 100755
--- a/fortran/examples/nufft1d_demo.f
+++ b/fortran/examples/nufft1d_demo.f
@@ -16,7 +16,7 @@
 c
       program nufft1d_demo
       implicit none
-      
+
 c     our fortran-header, always needed
       include 'finufft.fh'
 c
diff --git a/fortran/examples/nufft2d_demo.f b/fortran/examples/nufft2d_demo.f
index b37cfeffc..b07c3ea4a 100755
--- a/fortran/examples/nufft2d_demo.f
+++ b/fortran/examples/nufft2d_demo.f
@@ -16,7 +16,7 @@
 c
       program nufft2d_demo
       implicit none
-      
+
 c     our fortran-header, always needed
       include 'finufft.fh'
 c
@@ -113,7 +113,7 @@ program nufft2d_demo
          print *, ' ier = ',ier
          call errcomp(fk0,fk1,nk,err)
          print *, ' type 3 error = ',err
-      enddo 
+      enddo
       stop
       end
 c
diff --git a/fortran/examples/nufft2d_demof.f b/fortran/examples/nufft2d_demof.f
index b649f6109..3500b441e 100755
--- a/fortran/examples/nufft2d_demof.f
+++ b/fortran/examples/nufft2d_demof.f
@@ -16,7 +16,7 @@
 c
       program nufft2d_demof
       implicit none
-      
+
 c     our fortran-header, always needed
       include 'finufft.fh'
 c
@@ -108,7 +108,7 @@ program nufft2d_demof
          print *, ' ier = ',ier
          call errcomp(fk0,fk1,nk,err)
          print *, ' type 3 error = ',err
-      enddo 
+      enddo
       stop
       end
 c
diff --git a/fortran/examples/nufft2dmany_demo.f b/fortran/examples/nufft2dmany_demo.f
index 605237161..f8a4be981 100755
--- a/fortran/examples/nufft2dmany_demo.f
+++ b/fortran/examples/nufft2dmany_demo.f
@@ -7,7 +7,7 @@
 c
 c     Vectorized (many data vectors) demo type 1,2 by Melody Shih, 2018,
 c     type 3 by Alex Barnett, 2020. Based on nufft2d_demo.f.
-c     Also see: ../README.     
+c     Also see: ../README.
 c
 c     Compile with, eg (GCC, multithreaded; paste to a single line):
 c
@@ -29,7 +29,7 @@ program nufft2dmany_demo
 c     for default opts, make a null pointer...
       type(finufft_opts), pointer :: defopts => null()
 
-c     
+c
 c     --------------------------------------------------
 c     create some test data
 c     --------------------------------------------------
@@ -91,7 +91,7 @@ program nufft2dmany_demo
 c     call 2D Type 1 method
 c     -----------------------
 c
-         call finufft2d1many(ntrans,nj,xj,yj,cj,iflag, 
+         call finufft2d1many(ntrans,nj,xj,yj,cj,iflag,
      &                         eps,ms,mt,fk1,defopts,ier)
          do d = 1, ntrans
             call dirft2d1(nj,xj,yj,cj(1+(d-1)*nj:d*nj),iflag,ms,mt,
@@ -134,7 +134,7 @@ program nufft2dmany_demo
             maxerr = max(maxerr,err)
          enddo
          print *, ' max type 3 error = ',err
-      enddo 
+      enddo
       stop
       end
 c
diff --git a/fortran/examples/nufft2dmany_demof.f b/fortran/examples/nufft2dmany_demof.f
index d0bf80115..ac5f72abc 100755
--- a/fortran/examples/nufft2dmany_demof.f
+++ b/fortran/examples/nufft2dmany_demof.f
@@ -7,7 +7,7 @@
 c
 c     Vectorized (many data vectors) demo type 1,2 by Melody Shih, 2018,
 c     type 3 & single-prec by Alex Barnett, 2020. Based on nufft2d_demo.f.
-c     Also see: ../README.     
+c     Also see: ../README.
 c
 c     Compile with, eg (GCC, multithreaded; paste to a single line):
 c
@@ -17,7 +17,7 @@
 c     Note: you must link to single-precision build of FINUFFT
       program nufft2dmany_demof
       implicit none
-      
+
 c     our fortran-header, always needed
       include 'finufft.fh'
 c
@@ -86,7 +86,7 @@ program nufft2dmany_demof
 c     call 2D Type 1 method
 c     -----------------------
 c
-         call finufftf2d1many(ntrans,nj,xj,yj,cj,iflag, 
+         call finufftf2d1many(ntrans,nj,xj,yj,cj,iflag,
      &                         eps,ms,mt,fk1,defopts,ier)
          do d = 1, ntrans
             call dirft2d1f(nj,xj,yj,cj(1+(d-1)*nj:d*nj),iflag,ms,mt,
@@ -129,7 +129,7 @@ program nufft2dmany_demof
             maxerr = max(maxerr,err)
          enddo
          print *, ' max type 3 error = ',err
-      enddo 
+      enddo
       stop
       end
 c
diff --git a/fortran/examples/nufft3d_demo.f b/fortran/examples/nufft3d_demo.f
index af04afa5f..e8166d6cb 100755
--- a/fortran/examples/nufft3d_demo.f
+++ b/fortran/examples/nufft3d_demo.f
@@ -28,7 +28,7 @@ program nufft3d_demo
       complex*16, allocatable :: cj(:),cj0(:),cj1(:),fk0(:),fk1(:)
 c     for default opts, make a null pointer...
       type(finufft_opts), pointer :: defopts => null()
-      
+
 c
 c     --------------------------------------------------
 c     create some test data
@@ -122,7 +122,7 @@ program nufft3d_demo
          print *, ' ier = ',ier
          call errcomp(fk0,fk1,nk,err)
          print *, ' type 3 error = ',err
-      enddo 
+      enddo
       stop
       end
 c
diff --git a/fortran/examples/nufft3d_demof.f b/fortran/examples/nufft3d_demof.f
index 2e5a9e21f..89cd687b1 100755
--- a/fortran/examples/nufft3d_demof.f
+++ b/fortran/examples/nufft3d_demof.f
@@ -16,7 +16,7 @@
 c
       program nufft3d_demof
       implicit none
-      
+
 c     our fortran-header, always needed
       include 'finufft.fh'
 
@@ -29,7 +29,7 @@ program nufft3d_demof
 c     for default opts, make a null pointer...
       type(finufft_opts), pointer :: defopts => null()
 
-c     
+c
 c     --------------------------------------------------
 c     create some test data
 c     --------------------------------------------------
@@ -117,7 +117,7 @@ program nufft3d_demof
          print *, ' ier = ',ier
          call errcomp(fk0,fk1,nk,err)
          print *, ' type 3 error = ',err
-      enddo 
+      enddo
       stop
       end
 c
diff --git a/fortran/examples/simple1d1.f b/fortran/examples/simple1d1.f
index 534f9b255..98ba187d8 100755
--- a/fortran/examples/simple1d1.f
+++ b/fortran/examples/simple1d1.f
@@ -4,7 +4,7 @@
 c     Legacy-style: f77, plus dynamic allocation & derived types from f90.
 
 c     To compile (linux/GCC) from this directory, use eg (paste to one line):
-      
+
 c     gfortran -fopenmp -I../../include simple1d1.f -o simple1d1
 c     ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++
 
@@ -12,7 +12,7 @@
 
       program simple1d1
       implicit none
-      
+
 c     our fortran-header, always needed
       include 'finufft.fh'
 
@@ -29,7 +29,7 @@ program simple1d1
       type(finufft_opts) opts
 c     or this is if you want default opts, make a null pointer...
       type(finufft_opts), pointer :: defopts => null()
-      
+
 c     how many nonuniform pts
       M = 2000000
 c     how many modes
@@ -77,7 +77,7 @@ program simple1d1
       ktestindex = ktest + N/2 + 1
       print '("rel err for mode k=",i10," is ",e10.2)',ktest,
      $     cdabs(fk(ktestindex)-fktest)/fmax
-      
+
 c     do another transform, but now first setting some options...
       print *,''
       print *, 'setting new options, rerun simple interface...'
@@ -96,6 +96,6 @@ program simple1d1
       else
          print *,'failed! ier=',ier
       endif
-      
+
       stop
       end
diff --git a/fortran/examples/simple1d1.f90 b/fortran/examples/simple1d1.f90
index f173cc6df..f0fb3378a 100755
--- a/fortran/examples/simple1d1.f90
+++ b/fortran/examples/simple1d1.f90
@@ -5,18 +5,18 @@
 
 ! To compile (linux/GCC) from this directory, note the module also has to be
 ! compiled, eg:
-      
+
 ! gfortran -fopenmp ../../include/finufft_mod.f90 simple1d1.f90 -o simple1d1_f90 ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++
 
 ! Alex Barnett, to demo Reinhard Neder f90 module, 1/20/23.
 
 program simple1d1
-  
+
   ! use the module (which happens to live in ../../include)
   use finufft_mod
-  
+
   implicit none
-  
+
   ! note some inputs are int (int*4) but others BIGINT (int*8)
   integer ier,iflag
   integer*8 N,ktest,M,j,k,ktestindex,t1,t2,crate
@@ -25,12 +25,12 @@ program simple1d1
   parameter (pi=3.141592653589793238462643383279502884197d0)
   complex*16, allocatable :: cj(:),fk(:)
   complex*16 fktest
-  
+
   ! this is how you create the options struct in fortran...
   type(finufft_opts) opts
   ! or this is if you want default opts, make a null pointer...
   type(finufft_opts), pointer :: defopts => null()
-  
+
   ! how many nonuniform pts
   M = 2000000
   ! how many modes
@@ -46,7 +46,7 @@ program simple1d1
      xj(j) = pi * dcos(pi*j/M)
      cj(j) = dcmplx( dsin((100d0*j)/M), dcos(1.0+(50d0*j)/M))
   enddo
-  
+
   call system_clock(t1)
   ! mandatory parameters to FINUFFT: sign of +-i in NUFFT
   iflag = 1
@@ -62,7 +62,7 @@ program simple1d1
   else
      print *,'failed! ier=',ier
   endif
-  
+
   ! math test: single output mode with given freq (not array index) k
   ktest = N/3
   fktest = dcmplx(0,0)
@@ -78,7 +78,7 @@ program simple1d1
   ktestindex = ktest + N/2 + 1
   print '("rel err for mode k=",i10," is ",e10.2)',ktest, &
        cdabs(fk(ktestindex)-fktest)/fmax
-      
+
   ! do another transform, but now first setting some options...
   print *,''
   print *, 'setting new options, rerun simple interface...'
@@ -97,6 +97,6 @@ program simple1d1
   else
      print *,'failed! ier=',ier
   endif
-  
+
   stop
 end program simple1d1
diff --git a/fortran/examples/simple1d1f.f b/fortran/examples/simple1d1f.f
index 3e2c8106a..f11d01abc 100755
--- a/fortran/examples/simple1d1f.f
+++ b/fortran/examples/simple1d1f.f
@@ -4,7 +4,7 @@
 c     Legacy-style: f77, plus dynamic allocation & derived types from f90.
 
 c     To compile (linux/GCC) from this directory, use eg (paste to one line):
-      
+
 c     gfortran -fopenmp -I../../include simple1d1f.f -o simple1d1f
 c     -L../../lib -lfinufftf
 
@@ -12,7 +12,7 @@
 
       program simple1d1f
       implicit none
-      
+
 c     our fortran header, always needed...
       include 'finufft.fh'
 
@@ -29,7 +29,7 @@ program simple1d1f
       type(finufft_opts) opts
 c     or this is if you want default opts, make a null pointer...
       type(finufft_opts), pointer :: defopts => null()
-     
+
 c     how many nonuniform pts
       M = 200000
 c     how many modes
@@ -77,7 +77,7 @@ program simple1d1f
       ktestindex = ktest + N/2 + 1
       print '("rel err for mode k=",i10," is ",e10.2)',ktest,
      $     cabs(fk(ktestindex)-fktest)/fmax
-      
+
 c     do another transform, but now first setting some options...
       print *,''
       print *, 'setting new options, rerun simple interface...'
@@ -97,6 +97,6 @@ program simple1d1f
       else
          print *,'failed! ier=',ier
       endif
-      
+
       stop
       end
diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h
index c3a31bd2b..7aff136df 100644
--- a/include/cufinufft/contrib/helper_cuda.h
+++ b/include/cufinufft/contrib/helper_cuda.h
@@ -134,7 +134,7 @@ static const char *cufftGetErrorString(cufftResult error) {
 }
 
 template<typename T>
-int check(T result, char const *const func, const char *const file, int const line) {
+int check(T result, const char *const func, const char *const file, const int line) {
   if (result) {
     fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
             static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 1be81c1df..26228704a 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -611,23 +611,23 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
   if (d_plan->dim > 0) {
     const auto ig1 = T(1) / d_plan->type3_params.gam1;
     const auto C1  = -d_plan->type3_params.C1;
-    thrust::transform(
-        thrust::cuda::par.on(stream), d_kx, d_kx + M, d_plan->kx,
-        [ig1, C1] __host__ __device__(const T x) -> T { return (x + C1) * ig1; });
+    thrust::transform(thrust::cuda::par.on(stream), d_kx, d_kx + M, d_plan->kx,
+                      [ig1, C1] __host__
+                      __device__(const T x) -> T { return (x + C1) * ig1; });
   }
   if (d_plan->dim > 1) {
     const auto ig2 = T(1) / d_plan->type3_params.gam2;
     const auto C2  = -d_plan->type3_params.C2;
-    thrust::transform(
-        thrust::cuda::par.on(stream), d_ky, d_ky + M, d_plan->ky,
-        [ig2, C2] __host__ __device__(const T x) -> T { return (x + C2) * ig2; });
+    thrust::transform(thrust::cuda::par.on(stream), d_ky, d_ky + M, d_plan->ky,
+                      [ig2, C2] __host__
+                      __device__(const T x) -> T { return (x + C2) * ig2; });
   }
   if (d_plan->dim > 2) {
     const auto ig3 = T(1) / d_plan->type3_params.gam3;
     const auto C3  = -d_plan->type3_params.C3;
-    thrust::transform(
-        thrust::cuda::par.on(stream), d_kz, d_kz + M, d_plan->kz,
-        [ig3, C3] __host__ __device__(const T x) -> T { return (x + C3) * ig3; });
+    thrust::transform(thrust::cuda::par.on(stream), d_kz, d_kz + M, d_plan->kz,
+                      [ig3, C3] __host__
+                      __device__(const T x) -> T { return (x + C3) * ig3; });
   }
   if (d_plan->type3_params.D1 != 0 || d_plan->type3_params.D2 != 0 ||
       d_plan->type3_params.D3 != 0) {
@@ -646,8 +646,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
     const auto realsign = d_plan->iflag >= 0 ? T(1) : T(-1);
     thrust::transform(
         thrust::cuda::par.on(stream), iterator, iterator + M, d_plan->prephase,
-        [D1, D2, D3, realsign] __host__ __device__(
-            const thrust::tuple<T, T, T> &tuple) -> cuda_complex<T> {
+        [D1, D2, D3, realsign] __host__
+        __device__(const thrust::tuple<T, T, T> &tuple) -> cuda_complex<T> {
           const auto x = thrust::get<0>(tuple);
           const auto y = thrust::get<1>(tuple);
           const auto z = thrust::get<2>(tuple);
@@ -667,23 +667,23 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
   if (d_plan->dim > 0) {
     const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1;
     const auto D1    = -d_plan->type3_params.D1;
-    thrust::transform(
-        thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_Sp,
-        [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); });
+    thrust::transform(thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_Sp,
+                      [scale, D1] __host__
+                      __device__(const T s) -> T { return scale * (s + D1); });
   }
   if (d_plan->dim > 1) {
     const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2;
     const auto D2    = -d_plan->type3_params.D2;
-    thrust::transform(
-        thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_Tp,
-        [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); });
+    thrust::transform(thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_Tp,
+                      [scale, D2] __host__
+                      __device__(const T t) -> T { return scale * (t + D2); });
   }
   if (d_plan->dim > 2) {
     const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3;
     const auto D3    = -d_plan->type3_params.D3;
-    thrust::transform(
-        thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_Up,
-        [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); });
+    thrust::transform(thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_Up,
+                      [scale, D3] __host__
+                      __device__(const T u) -> T { return scale * (u + D3); });
   }
   { // here we declare phi_hat1, phi_hat2, and phi_hat3
     // and the precomputed data for the fseries kernel
@@ -738,16 +738,16 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
                            dim > 1 ? phi_hat2.begin() : phi_hat1.begin(),
                            // to avoid out of bounds access, use phi_hat1 if dim < 3
                            dim > 2 ? phi_hat3.begin() : phi_hat1.begin()));
-    thrust::transform(
-        thrust::cuda::par.on(stream), phi_hat_iterator, phi_hat_iterator + N,
-        d_plan->deconv,
-        [dim] __host__ __device__(const thrust::tuple<T, T, T> tuple) -> cuda_complex<T> {
-          auto phiHat = thrust::get<0>(tuple);
-          // in case dim < 2 or dim < 3, multiply by 1
-          phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1);
-          phiHat *= (dim > 2) ? thrust::get<2>(tuple) : T(1);
-          return {T(1) / phiHat, T(0)};
-        });
+    thrust::transform(thrust::cuda::par.on(stream), phi_hat_iterator,
+                      phi_hat_iterator + N, d_plan->deconv,
+                      [dim] __host__
+                      __device__(const thrust::tuple<T, T, T> tuple) -> cuda_complex<T> {
+                        auto phiHat = thrust::get<0>(tuple);
+                        // in case dim < 2 or dim < 3, multiply by 1
+                        phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1);
+                        phiHat *= (dim > 2) ? thrust::get<2>(tuple) : T(1);
+                        return {T(1) / phiHat, T(0)};
+                      });
 
     if (is_c_finite && is_c_nonzero) {
       const auto c1       = d_plan->type3_params.C1;
@@ -764,9 +764,9 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
       thrust::transform(
           thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N,
           d_plan->deconv, d_plan->deconv,
-          [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__(
-              const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
-              -> cuda_complex<T> {
+          [c1, c2, c3, d1, d2, d3, realsign] __host__
+          __device__(const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
+          -> cuda_complex<T> {
             // d2 and d3 are 0 if dim < 2 and dim < 3
             const auto phase = c1 * (thrust::get<0>(tuple) + d1) +
                                c2 * (thrust::get<1>(tuple) + d2) +
diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
index aebf4bb0b..809ae8306 100644
--- a/include/cufinufft/spreadinterp.h
+++ b/include/cufinufft/spreadinterp.h
@@ -10,7 +10,7 @@ namespace cufinufft {
 namespace spreadinterp {
 
 template<typename T>
-static __forceinline__ __device__ constexpr T cudaFMA(const T a, const T b, const T c) {
+static constexpr __forceinline__ __device__ T cudaFMA(const T a, const T b, const T c) {
   if constexpr (std::is_same_v<T, float>) {
     // fused multiply-add, round to nearest even
     return __fmaf_rn(a, b, c);
diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h
index 5b2fba790..d36613833 100644
--- a/include/cufinufft/types.h
+++ b/include/cufinufft/types.h
@@ -100,10 +100,10 @@ template<typename T> struct cufinufft_plan_t {
   cudaStream_t stream;
 };
 
-template<typename T> constexpr static inline cufftType_t cufft_type();
-template<> constexpr inline cufftType_t cufft_type<float>() { return CUFFT_C2C; }
+template<typename T> static inline constexpr cufftType_t cufft_type();
+template<> inline constexpr cufftType_t cufft_type<float>() { return CUFFT_C2C; }
 
-template<> constexpr inline cufftType_t cufft_type<double>() { return CUFFT_Z2Z; }
+template<> inline constexpr cufftType_t cufft_type<double>() { return CUFFT_Z2Z; }
 
 static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata,
                                    cufftComplex *odata, int direction) {
diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt
index 92bdff37c..a35db66ae 100644
--- a/matlab/CMakeLists.txt
+++ b/matlab/CMakeLists.txt
@@ -7,7 +7,6 @@ file(GLOB FINUFFT_MATLAB_M_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.m)
 add_custom_command(
     TARGET finufft_mex
     POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy
-    ${FINUFFT_MATLAB_M_SOURCES}
-    ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>
-    VERBATIM)
+    COMMAND ${CMAKE_COMMAND} -E copy ${FINUFFT_MATLAB_M_SOURCES} ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>
+    VERBATIM
+)
diff --git a/matlab/finufft1d1.m b/matlab/finufft1d1.m
index c24e85642..0bb2bc739 100644
--- a/matlab/finufft1d1.m
+++ b/matlab/finufft1d1.m
@@ -43,7 +43,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function f = finufft1d1(x,c,isign,eps,ms,o)
 
 if nargin<6, o.dummy=1; end            % make a dummy options struct
diff --git a/matlab/finufft1d2.m b/matlab/finufft1d2.m
index 37f8f47cb..13e904bfc 100644
--- a/matlab/finufft1d2.m
+++ b/matlab/finufft1d2.m
@@ -42,7 +42,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function c = finufft1d2(x,isign,eps,f,o)
 
 if nargin<5, o.dummy=1; end
diff --git a/matlab/finufft1d3.m b/matlab/finufft1d3.m
index c1586e5ca..3d53bf324 100644
--- a/matlab/finufft1d3.m
+++ b/matlab/finufft1d3.m
@@ -40,7 +40,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function f = finufft1d3(x,c,isign,eps,s,o)
 
 if nargin<6, o.dummy=1; end
diff --git a/matlab/finufft2d1.m b/matlab/finufft2d1.m
index 76feb4d2d..2b1c874fb 100644
--- a/matlab/finufft2d1.m
+++ b/matlab/finufft2d1.m
@@ -48,7 +48,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function f = finufft2d1(x,y,c,isign,eps,ms,mt,o)
 
 if nargin<8, o.dummy=1; end
diff --git a/matlab/finufft2d2.m b/matlab/finufft2d2.m
index 3a3d2d768..1a48692c5 100644
--- a/matlab/finufft2d2.m
+++ b/matlab/finufft2d2.m
@@ -44,7 +44,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function c = finufft2d2(x,y,isign,eps,f,o)
 
 if nargin<6, o.dummy=1; end
diff --git a/matlab/finufft2d3.m b/matlab/finufft2d3.m
index daba6620e..8376e756c 100644
--- a/matlab/finufft2d3.m
+++ b/matlab/finufft2d3.m
@@ -41,7 +41,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function f = finufft2d3(x,y,c,isign,eps,s,t,o)
 
 if nargin<8, o.dummy=1; end
diff --git a/matlab/finufft3d1.m b/matlab/finufft3d1.m
index 71cf358cb..29505a901 100644
--- a/matlab/finufft3d1.m
+++ b/matlab/finufft3d1.m
@@ -50,7 +50,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function f = finufft3d1(x,y,z,c,isign,eps,ms,mt,mu,o)
 
 if nargin<10, o.dummy=1; end
diff --git a/matlab/finufft3d2.m b/matlab/finufft3d2.m
index ba1ab61f5..358a39de9 100644
--- a/matlab/finufft3d2.m
+++ b/matlab/finufft3d2.m
@@ -46,7 +46,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function c = finufft3d2(x,y,z,isign,eps,f,o)
 
 if nargin<7, o.dummy=1; end
diff --git a/matlab/finufft3d3.m b/matlab/finufft3d3.m
index a18129b7f..6e4b9d7b7 100644
--- a/matlab/finufft3d3.m
+++ b/matlab/finufft3d3.m
@@ -42,7 +42,6 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
 function f = finufft3d3(x,y,z,c,isign,eps,s,t,u,o)
 
 if nargin<10, o.dummy=1; end
@@ -52,4 +51,3 @@
 p = finufft_plan(3,3,isign,n_transf,eps,o);
 p.setpts(x,y,z,s,t,u);
 f = p.execute(c);
-
diff --git a/matlab/finufft_plan.docsrc b/matlab/finufft_plan.docsrc
index 80934d6f0..b167cb00f 100644
--- a/matlab/finufft_plan.docsrc
+++ b/matlab/finufft_plan.docsrc
@@ -34,7 +34,7 @@
 % Creates a finufft_plan MATLAB object in the guru interface to FINUFFT, of
 %  type 1, 2 or 3, and with given numbers of Fourier modes (unless type 3).
 %
-% Inputs: 
+% Inputs:
 %     type            transform type: 1, 2, or 3
 %     n_modes_or_dim  if type is 1 or 2, the number of Fourier modes in each
 %                     dimension: [ms] in 1D, [ms mt] in 2D, or [ms mt mu] in 3D.
@@ -138,4 +138,3 @@ OPTS12
 %  kernel Fourier transforms arrays, etc.
 %
 %
-
diff --git a/matlab/finufft_plan.m b/matlab/finufft_plan.m
index 6a841481c..b79c632e4 100644
--- a/matlab/finufft_plan.m
+++ b/matlab/finufft_plan.m
@@ -150,7 +150,6 @@
 %  kernel Fourier transforms arrays, etc.
 %
 %
-
 classdef finufft_plan < handle
 
   properties
diff --git a/matlab/notes.docbit b/matlab/notes.docbit
index 3a2af5b4c..2d8675344 100644
--- a/matlab/notes.docbit
+++ b/matlab/notes.docbit
@@ -7,4 +7,3 @@
 %  * For more details about the opts fields, see ../docs/opts.rst
 %  * See ERRHANDLER, VALID_* and FINUFFT_PLAN for possible warning/error IDs.
 %  * Full documentation is online at http://finufft.readthedocs.io
-
diff --git a/matlab/test/check_modeords.m b/matlab/test/check_modeords.m
index b9b954003..7282c0711 100644
--- a/matlab/test/check_modeords.m
+++ b/matlab/test/check_modeords.m
@@ -23,7 +23,7 @@
 f1 = finufft3d1(x,y,z,c,isign,eps,N1,N2,N3,o);
 f1 = fftshift(f1);  % handles odd dimension lengths Ns same as we do
 fprintf('\t modeord=1 vs 0: max error = %g\n',norm(f1(:)-f(:),Inf))
-% it's still a mystery how there can be any nonzero difference here - 
+% it's still a mystery how there can be any nonzero difference here -
 %  maybe the compiler is reordering the calcs, so roundoff appears, somehow?
 
 f = randn(N1,N2,N3)+1i*randn(N1,N2,N3);
diff --git a/matlab/test/fig_accuracy.m b/matlab/test/fig_accuracy.m
index 9a8ddac09..43ed36116 100644
--- a/matlab/test/fig_accuracy.m
+++ b/matlab/test/fig_accuracy.m
@@ -30,4 +30,3 @@
 axis tight; xlabel('tol'); ylabel('err');
 %title(sprintf('1d1: (maxerr)/||c||_1, M=%d, N=%d\n',M,N));
 title(sprintf('1d1: ||\tilde f - f||_2/||f||_2, M=%d, N=%d\n',M,N));
-
diff --git a/matlab/test/guru_setpts_issue.m b/matlab/test/guru_setpts_issue.m
index 47daa87ea..60ef13d77 100644
--- a/matlab/test/guru_setpts_issue.m
+++ b/matlab/test/guru_setpts_issue.m
@@ -36,7 +36,7 @@
     yy = -y;
     plan.setpts(xx, yy);
     vals3 = plan.execute(coeffs);
-    
+
     if ( any(isnan(vals2)) || norm(vals - vals2) > tol )
         warning('Something went wrong during run #%i', k);
         fprintf('norm(vals - vals2) = %g\n', norm(vals - vals2));
diff --git a/matlab/valid_setpts.m b/matlab/valid_setpts.m
index 0b0eef28a..3cea5179f 100644
--- a/matlab/valid_setpts.m
+++ b/matlab/valid_setpts.m
@@ -26,7 +26,7 @@
     if ~isvector(t), error('FINUFFT:badTshape','FINUFFT t must be a vector'); end
     if numel(t)~=nk, error('FINUFFT:badTlen','FINUFFT t must have same length as s'); end
   end
-end              
+end
 if dim>2
   if ~isvector(z), error('FINUFFT:badZshape','FINUFFT z must be a vector'); end
   if numel(z)~=nj, error('FINUFFT:badZlen','FINUFFT z must have same length as x'); end
@@ -34,4 +34,4 @@
     if ~isvector(u), error('FINUFFT:badUshape','FINUFFT u must be a vector'); end
     if numel(u)~=nk, error('FINUFFT:badUlen','FINUFFT u must have same length as s'); end
   end
-end   
+end
diff --git a/perftest/CMakeLists.txt b/perftest/CMakeLists.txt
index f0215a6df..e031cc807 100644
--- a/perftest/CMakeLists.txt
+++ b/perftest/CMakeLists.txt
@@ -2,26 +2,26 @@
 set(PERFTESTS guru_timing_test manysmallprobs spreadtestnd spreadtestndall)
 
 foreach(TEST ${PERFTESTS})
-  add_executable(${TEST} ${TEST}.cpp)
-  if(FINUFFT_USE_DUCC0)
-    target_compile_definitions(${TEST} PRIVATE -DFINUFFT_USE_DUCC0)
-  endif()
-  finufft_link_test(${TEST})
+    add_executable(${TEST} ${TEST}.cpp)
+    if(FINUFFT_USE_DUCC0)
+        target_compile_definitions(${TEST} PRIVATE -DFINUFFT_USE_DUCC0)
+    endif()
+    finufft_link_test(${TEST})
 
-  add_executable(${TEST}f ${TEST}.cpp)
-  target_compile_definitions(${TEST}f PRIVATE -DSINGLE)
-  if(FINUFFT_USE_DUCC0)
-    target_compile_definitions(${TEST}f PRIVATE -DFINUFFT_USE_DUCC0)
-  endif()
-  finufft_link_test(${TEST}f)
+    add_executable(${TEST}f ${TEST}.cpp)
+    target_compile_definitions(${TEST}f PRIVATE -DSINGLE)
+    if(FINUFFT_USE_DUCC0)
+        target_compile_definitions(${TEST}f PRIVATE -DFINUFFT_USE_DUCC0)
+    endif()
+    finufft_link_test(${TEST}f)
 endforeach()
 
 include(CheckIncludeFile)
 check_include_file("getopt.h" HAVE_GETOPT_H)
 if(HAVE_GETOPT_H)
-  add_executable(perftest perftest.cpp)
-  if(FINUFFT_USE_DUCC0)
-    target_compile_definitions(perftest PRIVATE -DFINUFFT_USE_DUCC0)
-  endif()
-  finufft_link_test(perftest)
+    add_executable(perftest perftest.cpp)
+    if(FINUFFT_USE_DUCC0)
+        target_compile_definitions(perftest PRIVATE -DFINUFFT_USE_DUCC0)
+    endif()
+    finufft_link_test(perftest)
 endif()
diff --git a/perftest/checkGuruTiming.sh b/perftest/checkGuruTiming.sh
index ba7395bb5..2ac2d1829 100755
--- a/perftest/checkGuruTiming.sh
+++ b/perftest/checkGuruTiming.sh
@@ -31,10 +31,8 @@ do
 	    modeNum2=${modes[index+1]}
 	    modeNum3=${modes[index+2]}
 
-	    echo "./guru_timing_test ${n_trials} ${type} ${dimension} ${modeNum1} ${modeNum2} ${modeNum3} ${srcpts} ${tolerance} ${debug}"				 
+	    echo "./guru_timing_test ${n_trials} ${type} ${dimension} ${modeNum1} ${modeNum2} ${modeNum3} ${srcpts} ${tolerance} ${debug}"
 	    ./guru_timing_test ${n_trials} ${type} ${dimension} ${modeNum1} ${modeNum2} ${modeNum3} ${srcpts} ${tolerance} ${debug}
 	done
     done
 done
-
-			       
diff --git a/perftest/compare_spreads.jl b/perftest/compare_spreads.jl
index a2b374821..658801496 100644
--- a/perftest/compare_spreads.jl
+++ b/perftest/compare_spreads.jl
@@ -15,7 +15,7 @@ function run_spread(repo,dim,M,N,tols,nthr,prec)
     if prec==Float64
         exec = "$repo/perftest/spreadtestnd"
     elseif prec==Float32
-        exec = "$repo/perftest/spreadtestndf"    
+        exec = "$repo/perftest/spreadtestndf"
     else error("prec not known!")
     end
     times = zeros(2,length(tols))     # spread col 1; interp col 2
@@ -82,9 +82,9 @@ if compute
         ts[:,i,ntolsd+1:end,2] = run_spread(repo2,dim,M,N,tolsf,nthr,Float32)
         println(ts[:,i,:,:])
     end
-    #tolstr = [[@sprintf "%.0e" tol for tol=tolsd]; [@sprintf "%.0ef" tol for tol=tolsf]] 
+    #tolstr = [[@sprintf "%.0e" tol for tol=tolsd]; [@sprintf "%.0ef" tol for tol=tolsf]]
     # strings for w (nspread) for plotting...
-    wstr = [[@sprintf "%d" -log10(tol)+1 for tol=tolsd]; [@sprintf "%df" -log10(tol)+1 for tol=tolsf]] 
+    wstr = [[@sprintf "%d" -log10(tol)+1 for tol=tolsd]; [@sprintf "%df" -log10(tol)+1 for tol=tolsf]]
     jldsave("$(fnam).jld2"; fnam,ts,wstr,dims,M,N,tolsd,tolsf,nthr)    # save all
     plot_all(fnam,ts,wstr,dims,M,N,nthr)
 else
diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt
index 92c870cc5..fdf04e723 100644
--- a/perftest/cuda/CMakeLists.txt
+++ b/perftest/cuda/CMakeLists.txt
@@ -2,11 +2,12 @@ add_executable(cuperftest cuperftest.cu)
 target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
 target_link_libraries(cuperftest PUBLIC cufinufft)
 target_compile_features(cuperftest PRIVATE cxx_std_17)
-target_compile_options(cuperftest
-                       PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+target_compile_options(cuperftest PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
 set_target_properties(
-  cuperftest
-  PROPERTIES LINKER_LANGUAGE CUDA
-             CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
-             CUDA_STANDARD 17
-             CUDA_STANDARD_REQUIRED ON)
+    cuperftest
+    PROPERTIES
+        LINKER_LANGUAGE CUDA
+        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
+        CUDA_STANDARD 17
+        CUDA_STANDARD_REQUIRED ON
+)
diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu
index f72ffb3e6..777fd48ab 100644
--- a/perftest/cuda/cuperftest.cu
+++ b/perftest/cuda/cuperftest.cu
@@ -302,10 +302,10 @@ int main(int argc, char *argv[]) {
         std::cout << "Valid options:\n"
                      "    --prec <char>\n"
                      "           float or double precision. i.e. 'f' or 'd'\n"
-                     "           default: " << default_opts.prec << "\n" << 
+                     "           default: " << default_opts.prec << "\n" <<
                      "    --type <int>\n"
                      "           type of transform. 1 or 2\n"
-                     "           default: " << default_opts.type << "\n" << 
+                     "           default: " << default_opts.type << "\n" <<
                      "    --n_runs <int>\n"
                      "           number of runs to average performance over\n"
                      "           default: " << default_opts.n_runs << "\n" <<
diff --git a/perftest/mymaxthreads.sh b/perftest/mymaxthreads.sh
index 1f970f274..ab0b5fed3 100755
--- a/perftest/mymaxthreads.sh
+++ b/perftest/mymaxthreads.sh
@@ -15,7 +15,7 @@ case "${unameOut}" in
         # not sure this correct...
         echo "$NUMBER_OF_PROCESSORS"
         ;;
-    *)        
+    *)
 	echo "I'm in an unknown or unsupported operating system: ${unameOut}" >&2
         ;;
 esac
diff --git a/perftest/mynumcores.sh b/perftest/mynumcores.sh
index 962fa6d3f..780b37681 100755
--- a/perftest/mynumcores.sh
+++ b/perftest/mynumcores.sh
@@ -20,7 +20,7 @@ case "${unameOut}" in
         # not sure this is correct...
         echo "$NUMBER_OF_PROCESSORS"
         ;;
-    *)        
+    *)
 	echo "I'm in an unknown or unsupported operating system: ${unameOut}" >&2
         ;;
 esac
diff --git a/perftest/results/gcc_vs_icc_xeon.txt b/perftest/results/gcc_vs_icc_xeon.txt
index 212436671..ae3ebae8d 100644
--- a/perftest/results/gcc_vs_icc_xeon.txt
+++ b/perftest/results/gcc_vs_icc_xeon.txt
@@ -8,7 +8,7 @@ Timings in sec:
 single-thread:
 
 	t1		t2
-		
+
 	GCC	ICC	GCC	ICC
 
 1d	2.7	1.7	3.3	2.7
@@ -22,4 +22,3 @@ single-thread:
 3d	.82	.62	.76	.58
 
 Note: sorting a bit slower under ICC, but spreading is faster.
-
diff --git a/perftest/searchForTimeMetrics.py b/perftest/searchForTimeMetrics.py
index c7e450b70..983b38809 100644
--- a/perftest/searchForTimeMetrics.py
+++ b/perftest/searchForTimeMetrics.py
@@ -3,7 +3,7 @@
 
 decimalMatchString = "\d+\.?\d+" #regular expression to match a decimal number
 sciNotString = "(\d*.?\d*e-\d* s)" #regular expression to match a number in scientific notation
-wholeNumberMatchString = "\d+" 
+wholeNumberMatchString = "\d+"
 
 
 #search string needs to have two groupings! (one for everything besides) (time s)
@@ -25,12 +25,10 @@ def sumAllTime(searchString, strOut):
     lineMatch = re.findall(searchString,strOut)
     for match in lineMatch:
         val = re.search(sciNotString, match[1])
-        if(not val): #search failed, try decimal format 
+        if(not val): #search failed, try decimal format
             val = re.search(decimalMatchString, match[1])
         if(not val):
             val = re.search(wholeNumberMatchString, match[1])
         newVal = newVal + float(val.group(0).split('s')[0].strip()) #trim off " s"
     newVal = round(newVal,5)
     return newVal
-
-
diff --git a/perftest/spreadbenchmark.py b/perftest/spreadbenchmark.py
index 88f8ff30d..d9ed1328e 100644
--- a/perftest/spreadbenchmark.py
+++ b/perftest/spreadbenchmark.py
@@ -51,7 +51,7 @@ def runCommand(cmd):
     status, output = commands.getstatusoutput(cmd)
     print output
     return output
-    
+
 # Test runner
 def runTests():
     print "Running tests..."
@@ -70,7 +70,7 @@ def runTests():
             interp_speed = max(interp_speed, mi.group("speed"))
             interp_err = max(interp_err, mi.group("err"))
             spread_speed = max(spread_speed, ms.group("speed"))
-            spread_err = max(spread_err, ms.group("err"))            
+            spread_err = max(spread_err, ms.group("err"))
         results.append({"cmd":cmd,
                         "interp_speed":interp_speed,
                         "interp_err":interp_err,
@@ -90,7 +90,7 @@ def checkoutandmake(commit):
         runCommand("git checkout %s" % commit)
     print "Making..."
     print commands.getoutput("make clean test/spreadtestnd " + makeflags)
-    
+
 def restore(commit, home):
     if commit == "local":
         # We just tested local, so do nothing
@@ -98,16 +98,16 @@ def restore(commit, home):
     else:
         # Return home and pop stash
         print "Checking out %s and popping stash..." % home
-        runCommand("git checkout %s" % home)        
+        runCommand("git checkout %s" % home)
         runCommand("git stash pop")
-    
-# Run tests    
-print "* Testing %s" % commit_A    
+
+# Run tests
+print "* Testing %s" % commit_A
 checkoutandmake(commit_A)
 res_A = runTests()
 restore(commit_A, home)
 
-print "* Testing %s" % commit_B    
+print "* Testing %s" % commit_B
 checkoutandmake(commit_B)
 res_B = runTests()
 restore(commit_B, home)
@@ -132,10 +132,10 @@ def restore(commit, home):
 
     spread_speedup = float(res_B[idx]["spread_speed"]) / float(res_A[idx]["spread_speed"])*100 - 100
     spread_sign = "+" if spread_speedup > 0 else ""
-    spread_speedup = spread_sign + "%.1f" % spread_speedup + "%"    
-    interp_speedup = float(res_B[idx]["interp_speed"]) / float(res_A[idx]["interp_speed"])*100 - 100   
+    spread_speedup = spread_sign + "%.1f" % spread_speedup + "%"
+    interp_speedup = float(res_B[idx]["interp_speed"]) / float(res_A[idx]["interp_speed"])*100 - 100
     interp_sign = "+" if interp_speedup > 0 else ""
-    interp_speedup = interp_sign + "%.1f" % interp_speedup + "%"    
+    interp_speedup = interp_sign + "%.1f" % interp_speedup + "%"
     print format % ("", spread_speedup, interp_speedup, "", "")
 
     print ""
diff --git a/perftest/spreadingSchemeStats.py b/perftest/spreadingSchemeStats.py
index fce834f92..c6e9e754c 100644
--- a/perftest/spreadingSchemeStats.py
+++ b/perftest/spreadingSchemeStats.py
@@ -11,19 +11,19 @@
 n_trials = 1
 
 #Small problem, sequential Multithreading
-cmdStringOne_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 0" 
+cmdStringOne_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 0"
 cmdStringOne_desc = "Small Problem, sequential Multithreaded"
 
 #small problem, simulataneous single threaded, nested at end
-cmdStringTwo_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 1" 
+cmdStringTwo_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e1 1e1 1e1 1e4 1e-6 1 1"
 cmdStringTwo_desc = "Small Problem, single threaded sort in parallel"
 
 #Large problem, sequential Multithreading
-cmdStringThree_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 0" 
+cmdStringThree_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 0"
 cmdStringThree_desc = "Large Problem, sequential Multithreaded"
 
 #Large problem, simulataneous single threaded, nested at end
-cmdStringFour_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 1" 
+cmdStringFour_string = "./finufftGuru_test " + str(n_trials) + " "+ str(type) +" 3 1e2 1e2 1e2 1e7 1e-6 1 1"
 cmdStringFour_desc = "Large Problem, single threaded sort in parallel"
 
 cmdStrings = [cmdStringOne_string, cmdStringTwo_string, cmdStringThree_string, cmdStringFour_string]
@@ -35,22 +35,22 @@
     print(cmddesc)
     print(cmdStrings[i])
     out =  subprocess.run([cmdString[0], cmdString[1], cmdString[2], cmdString[3],cmdString[4], cmdString[5],
-                           cmdString[6], cmdString[7], cmdString[8], cmdString[8], cmdString[10]], 
+                           cmdString[6], cmdString[7], cmdString[8], cmdString[8], cmdString[10]],
                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    
+
     strOut = out.stdout.decode() #convert bytes to string
     #print(strOut) #uncomment to see debug output
     ###############################################################################
     #Gather Spreading Stats
-    
-    #spread (old) / [sort+spread]  (new)            
+
+    #spread (old) / [sort+spread]  (new)
     newSort = stm.sumAllTime('(.*finufft_setpts.*sort)(.*)', strOut) #if type three, adds together both sorts
 
     #collect spreading if any
-    newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut) 
+    newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut)
 
     #collect interp if any
-    newInterp = stm.extractTime('(.*finufft.*exec.*interp)(.*)',strOut) 
+    newInterp = stm.extractTime('(.*finufft.*exec.*interp)(.*)',strOut)
 
     #collect the spread timings for each trial of old
     totalOldSpread = stm.sumAllTime('(.*spread.*ier)(.*)', strOut) #gets spread AND unspread (i.e. interpolation) for all trials
diff --git a/perftest/spreadtestnd.sh b/perftest/spreadtestnd.sh
index ffb36fde8..b12309b4f 100755
--- a/perftest/spreadtestnd.sh
+++ b/perftest/spreadtestnd.sh
@@ -23,7 +23,7 @@ else
     PREC=double
     ST=./spreadtestnd
 fi
-   
+
 echo
 export OMP_NUM_THREADS=$TESTTHREADS
 echo "$PREC-precision $OMP_NUM_THREADS-thread tests: #NU = $M, #U = $N, tol = $TOL..."
diff --git a/perftest/timingBreakdowns.py b/perftest/timingBreakdowns.py
index 3abf92e17..efcdad214 100644
--- a/perftest/timingBreakdowns.py
+++ b/perftest/timingBreakdowns.py
@@ -11,7 +11,7 @@
 '''
 A script to run a seris of finufftGuru_tests with varying parameters
 Captures the stdout, parses it for timing statistics, and graphs speedup
-ratio trends. 
+ratio trends.
 '''
 
 
@@ -26,7 +26,7 @@
 modes = [1e6,1,1,1e3,1e3,1,1e2,1e2,1e2]
 
 dimensions = [1,2,3]
-types = [1,2,3] 
+types = [1,2,3]
 #n_trials=[1,10,100]
 n_trials = [1,10]
 
@@ -84,20 +84,20 @@
 for dim in dimensions:
     for ftype in types:
         for trial in n_trials:
-            
-            print( "./finufftGuru_test "+ str(trial)+ " " + str(ftype)+  " " +str(dim)+ " " + 
-                                  str(modes[3*(dim-1)])+ " " + str(modes[3*(dim-1)+1])+ " " +  str(modes[3*(dim-1)+2])+ " " + 
-                                  str(M_srcpts)+  " " + str(tolerance) + " " + str(debug)); 
+
+            print( "./finufftGuru_test "+ str(trial)+ " " + str(ftype)+  " " +str(dim)+ " " +
+                                  str(modes[3*(dim-1)])+ " " + str(modes[3*(dim-1)+1])+ " " +  str(modes[3*(dim-1)+2])+ " " +
+                                  str(M_srcpts)+  " " + str(tolerance) + " " + str(debug));
             #execute the test for this set of parameters
-            out =  subprocess.run(["./finufftGuru_test", str(trial), str(ftype), str(dim), 
-                                  str(modes[3*(dim-1)]),str(modes[3*(dim-1)+1]),  str(modes[3*(dim-1)+2]), 
-                                  str(M_srcpts), str(tolerance), str(debug)], 
+            out =  subprocess.run(["./finufftGuru_test", str(trial), str(ftype), str(dim),
+                                  str(modes[3*(dim-1)]),str(modes[3*(dim-1)+1]),  str(modes[3*(dim-1)+2]),
+                                  str(M_srcpts), str(tolerance), str(debug)],
                                  stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 
             strOut = out.stdout.decode() #convert bytes to string
             print(strOut)
 
-            
+
             #parse the output and syphon into data arrays
 
             ###############################################################################
@@ -133,12 +133,12 @@
                 totalTimeT3Ratio.append(totalSpeedup)
 
             ###############################################################################
-                
-            #spread (old) / [sort+spread]  (new)            
+
+            #spread (old) / [sort+spread]  (new)
             newSort = stm.sumAllTime('(.*finufft_setpts.*sort)(.*)', strOut)
 
             #collect spreading if any
-            newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut) 
+            newSpread = stm.extractTime('(.*finufft.*exec.*spread)(.*)' , strOut)
 
             #collect interp if any
             newInterp = stm.extractTime('(.*finufft.*exec.*interp)(.*)',strOut)
@@ -166,7 +166,7 @@
             #fftw_plan(old) / fftw_plan(new)
             planSciNotString = '(\(\d+\)[ \t]+)(\d*.?\d*e-\d* s)'
             planDecimalMatchString= '(\(\d+\)[ \t]+)(\d*\.?\d* s)'
-            planWholeNumberMatchString= '(\(\d+\)[ \t]+)(\d+ s)' 
+            planWholeNumberMatchString= '(\(\d+\)[ \t]+)(\d+ s)'
 
             #collect new fftw_plan time
             new_fftwPlan=0
@@ -178,12 +178,12 @@
                 if(not fftwPlanVal):
                     fftwPlanVal = re.search(wholeNumberMatchString, lineMatch.group(0))
                 new_fftwPlan = float(fftwPlanVal.group(2).split('s')[0])  #strip off s
-            new_fftwPlan = round(new_fftwPlan,5)    
+            new_fftwPlan = round(new_fftwPlan,5)
 
             #collect the fftw_plan timings for each trial of old
             isInitial = True
             initialLookup = 0
-            totalOldfftwPlan=0   
+            totalOldfftwPlan=0
             lineMatch = re.findall('(?<!\[make plan\] )fftw plan \(64\).*', strOut) #all fftw plan lines that don't include "make plan" indicating old implm.
             if(lineMatch):
                 for match in lineMatch:
@@ -196,13 +196,13 @@
                     if(isInitial): #Capture the first fftwplan output - indicating initial construction time
                         initalLookup = oldfftwPlanVal
                         isInitial = False
-                    
+
                     totalOldfftwPlan = totalOldfftwPlan + oldfftwPlanVal
             totalOldfftwPlan = round(totalOldfftwPlan,5)
-            
+
             #These plan ratios include the initial old implementation plan construction!!
             fftwPlanRatio = round(totalOldfftwPlan/new_fftwPlan,5)
-            
+
             if(ftype == 1):
                 fftwPlanT1Ratio.append(fftwPlanRatio)
                 fftwPlanT1_New.append(new_fftwPlan)
@@ -218,7 +218,7 @@
                 fftwPlanT3_New.append(new_fftwPlan)
                 fftwPlanT3_Old.append(totalOldfftwPlan)
                 fftwPlanT3_Old_initial.append(initialLookup)
-            
+
             ###############################################################################
             #fftw_exec(old) / fftw_exec(new)
 
@@ -226,10 +226,10 @@
             new_fft = stm.extractTime("(.*finufft_exec.*fft)(.*)" , strOut)
 
             #collect the fftw_exec timings for each trial of old
-            totalOldfft = stm.sumAllTime("(.*fft \(\d+ threads\))(.*)",strOut) 
+            totalOldfft = stm.sumAllTime("(.*fft \(\d+ threads\))(.*)",strOut)
 
             fftRatio = round(totalOldfft/new_fft,5)
-            
+
             if(ftype == 1):
                 fftT1Ratio.append(fftRatio)
                 fftT1_New.append(new_fft)
@@ -243,11 +243,11 @@
                 fftT3_New.append(new_fft)
                 fftT3_Old.append(totalOldfft)
 
-            
+
             ###############################################################################
 
 
-#Construct the bar graph 
+#Construct the bar graph
 barWidth = 0.25
 barDepth = 0.25
 
@@ -255,7 +255,7 @@
 _t2ys = [y + barWidth for y in _t1ys]
 _t3ys = [y + barWidth for y in _t2ys]
 
-_t1xs = n_trials 
+_t1xs = n_trials
 _t2xs = [x + barDepth for x in _t1xs]
 _t3xs = [x + barDepth for x in _t2xs]
 
@@ -272,7 +272,7 @@
 print(t1x)
 print("dimensions: ")
 print(t1y)
-    
+
 zbot = np.zeros(len(t1x))
 widths = [barWidth]*len(t1x)
 depths = [barDepth]*len(t1x)
@@ -285,7 +285,7 @@
 
 
 ##################TotalSpeed BAR GRAPH####################################################
-print("##################Total Time####################################################") 
+print("##################Total Time####################################################")
 print("\n")
 print("Raw T1 Total Time New " + str(totalTimeT1_New))
 print("Raw T1 Total Time Old " + str(totalTimeT1_Old))
@@ -316,7 +316,7 @@
 plt.title('totalOldTime/totalNewTime')
 
 
-#### Speed Statistics SANS Initial Planning Time 
+#### Speed Statistics SANS Initial Planning Time
 
 TotalSpeedRatioSansPlanT1 = (np.array(totalTimeT1_Old) - np.array(fftwPlanT1_Old_initial))/(np.array(totalTimeT1_New) - np.array(fftwPlanT1_New))
 TotalSpeedRatioSansPlanT2 = (np.array(totalTimeT2_Old) - np.array(fftwPlanT2_Old_initial))/(np.array(totalTimeT2_New) - np.array(fftwPlanT2_New))
@@ -334,7 +334,7 @@
 
 
 ##################SPREADING BAR GRAPH####################################################
-print("##################SPREADING####################################################") 
+print("##################SPREADING####################################################")
 print("\n")
 print("Raw T1 Spreading New" + str(spreadT1_New))
 print("Raw T1 Spreading Old" + str(spreadT1_Old))
@@ -435,15 +435,3 @@
 plt.show()
 
 fig.savefig('timing_breakdowns_rusty_node.png')
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/perftest/timingResults/timingBreakdowns_largeProblems.out b/perftest/timingResults/timingBreakdowns_largeProblems.out
index 013bad58d..95318846f 100644
--- a/perftest/timingResults/timingBreakdowns_largeProblems.out
+++ b/perftest/timingResults/timingBreakdowns_largeProblems.out
@@ -2,7 +2,7 @@ M = 1e7
 modes = [1e6,1,1,1e3,1e3,1,1e2,1e2,1e2]
 n_trials:
 [  1  10 100   1  10 100   1  10 100]
-dimensions: 
+dimensions:
 [1 1 1 2 2 2 3 3 3]
 
 ##################Total Time####################################################
diff --git a/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti.out b/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti.out
index 17899ae54..f4f0b7a19 100644
--- a/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti.out
+++ b/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti.out
@@ -4,7 +4,7 @@ modes = [1e4,1,1,1e2,1e2,1,1e1,1e1,1e1]
 n_trials:
 [  1  10 100   1  10 100   1  10 100]
 
-dimensions: 
+dimensions:
 [1 1 1 2 2 2 3 3 3]
 
 ##################Total Time####################################################
@@ -82,4 +82,3 @@ FFT Exec T2Ratio [0.54878, 3.06618, 2.516, 0.45333, 1.93548, 1.92403, 0.2029, 4.
 FFT Exec T3 New [0.00197, 0.00941, 0.00457, 0.00211, 0.00909, 0.021, 0.00044, 0.0033, 0.00909]
 FFT Exec T3 Old [0.00168, 0.01421, 0.13068, 0.00178, 0.01478, 0.13562, 0.00053, 0.00251, 0.03387]
 FFT Exec T3Ratio [0.85279, 1.5101, 28.59519, 0.8436, 1.62596, 6.4581, 1.20455, 0.76061, 3.72607]
-
diff --git a/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti_noSwitch.out b/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti_noSwitch.out
index b8b03cce8..19a7e11af 100644
--- a/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti_noSwitch.out
+++ b/perftest/timingResults/timingBreakdowns_smallProblems_SequentialMulti_noSwitch.out
@@ -3,7 +3,7 @@ modes = [1e4,1,1,1e2,1e2,1,1e1,1e1,1e1]
 
 n_trials:
 [  1  10 100   1  10 100   1  10 100]
-dimensions: 
+dimensions:
 [1 1 1 2 2 2 3 3 3]
 
 
diff --git a/perftest/timingResults/timingBreakdowns_smallProblems_SimultaneousSingle.out b/perftest/timingResults/timingBreakdowns_smallProblems_SimultaneousSingle.out
index f23ac3758..3981fa1bb 100644
--- a/perftest/timingResults/timingBreakdowns_smallProblems_SimultaneousSingle.out
+++ b/perftest/timingResults/timingBreakdowns_smallProblems_SimultaneousSingle.out
@@ -3,7 +3,7 @@ modes = [1e4,1,1,1e2,1e2,1,1e1,1e1,1e1]
 
 n_trials:
 [  1  10 100   1  10 100   1  10 100]
-dimensions: 
+dimensions:
 [1 1 1 2 2 2 3 3 3]
 ##################Total Time####################################################
 
@@ -80,4 +80,3 @@ FFT Exec T2Ratio [0.5, 3.09023, 2.1695, 2.10227, 5.31414, 1.87917, 0.18421, 4.09
 FFT Exec T3 New [0.00204, 0.00467, 0.00524, 0.00204, 0.0113, 0.0118, 0.0102, 0.00595, 0.00822]
 FFT Exec T3 Old [0.00169, 0.01488, 0.05617, 0.00188, 0.01403, 0.11084, 0.00103, 0.00705, 0.03543]
 FFT Exec T3Ratio [0.82843, 3.1863, 10.71947, 0.92157, 1.24159, 9.39322, 0.10098, 1.18487, 4.31022]
-
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 862668da2..ab71b53da 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,23 +1,20 @@
 if(FINUFFT_USE_CPU)
-  if(WIN32)
-    install(
-      TARGETS finufft
-      LIBRARY DESTINATION finufft
-      RUNTIME DESTINATION finufft)
-  else()
-    install(TARGETS finufft LIBRARY DESTINATION finufft)
-  endif()
+    if(WIN32)
+        install(TARGETS finufft LIBRARY DESTINATION finufft RUNTIME DESTINATION finufft)
+    else()
+        install(TARGETS finufft LIBRARY DESTINATION finufft)
+    endif()
 endif()
 
 if(FINUFFT_USE_CUDA)
-  install(TARGETS cufinufft LIBRARY DESTINATION cufinufft)
+    install(TARGETS cufinufft LIBRARY DESTINATION cufinufft)
 endif()
 
 # Warn if the user invokes CMake directly
 if(NOT SKBUILD)
-  message(
-    WARNING
-      "\
+    message(
+        WARNING
+        "\
   This CMake file is meant to be executed using 'scikit-build-core'.
   Running it directly will almost certainly not produce the desired
   result. If you are a user trying to install this package, use the
@@ -38,5 +35,6 @@ if(NOT SKBUILD)
   =====================================================================
   You may optionally add -Ceditable.rebuild=true to auto-rebuild when
   the package is imported. Otherwise, you need to rerun the above
-  after editing C++ files.")
+  after editing C++ files."
+    )
 endif()
diff --git a/python/finufft/test/accuracy_speed_tests.py b/python/finufft/test/accuracy_speed_tests.py
index 78de3d7dd..ed39cf083 100644
--- a/python/finufft/test/accuracy_speed_tests.py
+++ b/python/finufft/test/accuracy_speed_tests.py
@@ -137,7 +137,7 @@ def accuracy_speed_tests(num_nonuniform_points,num_uniform_points,eps):
 		Xest[ii]=np.sum(fk[dtest,:,:] * np.exp(1j*(Ks*xj[ii]+Kt*yj[ii])))
 		Xtrue[ii]=cj[dtest,ii]
 	print_report('finufft2d2many',elapsed,Xest,Xtrue,ndata*nj)
-	
+
 	# 2d3
 	x=np.random.rand(nj)*2*math.pi-math.pi
 	y=np.random.rand(nj)*2*math.pi-math.pi
diff --git a/src/cuda/1d/README b/src/cuda/1d/README
index f33ece6cc..44f70727f 100644
--- a/src/cuda/1d/README
+++ b/src/cuda/1d/README
@@ -5,10 +5,10 @@
   This file contains all the GPU kernels for 1d spreading, interpolation.
 
 - interp1d_wrapper.cu
-  Wrappers for 1d interpolations. One method is implemented: 
-    (1) nonuniform driven, 
-    
+  Wrappers for 1d interpolations. One method is implemented:
+    (1) nonuniform driven,
+
 - spread1d_wrapper.cu
-  Wrappers for 1d spreading. Two methods are implemented: 
-    (1) nonuniform driven, 
-    (2) subproblem 
+  Wrappers for 1d spreading. Two methods are implemented:
+    (1) nonuniform driven,
+    (2) subproblem
diff --git a/src/cuda/2d/README b/src/cuda/2d/README
index f5f45f8ff..335689cbf 100644
--- a/src/cuda/2d/README
+++ b/src/cuda/2d/README
@@ -5,11 +5,11 @@
   This file contains all the GPU kernels for 2d spreading, interpolation.
 
 - interp2d_wrapper.cu
-  Wrappers for 2d interpolations. Two methods are implemented: 
-    (1) nonuniform driven, 
+  Wrappers for 2d interpolations. Two methods are implemented:
+    (1) nonuniform driven,
     (2) subproblem
-    
+
 - spread2d_wrapper.cu
-  Wrappers for 2d spreading. Three methods are implemented: 
-    (1) nonuniform driven, 
-    (2) subproblem, 
+  Wrappers for 2d spreading. Three methods are implemented:
+    (1) nonuniform driven,
+    (2) subproblem,
diff --git a/src/cuda/3d/README b/src/cuda/3d/README
index 9c942945f..e1d60542e 100644
--- a/src/cuda/3d/README
+++ b/src/cuda/3d/README
@@ -5,12 +5,12 @@
   This file contains all the GPU kernels for 3d spreading, interpolation.
 
 - interp3d_wrapper.cu
-  Wrappers for 3d interpolations. Two methods are implemented: 
-    (1) nonuniform driven, 
+  Wrappers for 3d interpolations. Two methods are implemented:
+    (1) nonuniform driven,
     (2) subproblem
-    
+
 - spread3d_wrapper.cu
-  Wrappers for 3d spreading. Three methods are implemented: 
-    (1) nonuniform points driven, 
-    (2) subproblem, 
-    (4) block gather 
+  Wrappers for 3d spreading. Three methods are implemented:
+    (1) nonuniform points driven,
+    (2) subproblem,
+    (4) block gather
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index 9136f6099..eccebb17a 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -1,6 +1,4 @@
-set(PRECISION_INDEPENDENT_SRC
-    precision_independent.cu utils.cpp
-    ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)
+set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)
 
 set(PRECISION_DEPENDENT_SRC
     spreadinterp.cpp
@@ -16,16 +14,17 @@ set(PRECISION_DEPENDENT_SRC
     memtransfer_wrapper.cu
     deconvolve_wrapper.cu
     cufinufft.cu
-    common.cu)
+    common.cu
+)
 
 set(CUFINUFFT_INCLUDE_DIRS
-    ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/contrib
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/contrib
     $<TARGET_PROPERTY:CUDA::cudart,INTERFACE_INCLUDE_DIRECTORIES>
-    $<TARGET_PROPERTY:CUDA::cufft,INTERFACE_INCLUDE_DIRECTORIES>)
+    $<TARGET_PROPERTY:CUDA::cufft,INTERFACE_INCLUDE_DIRECTORIES>
+)
 
-set(CUFINUFFT_INCLUDE_DIRS
-    ${CUFINUFFT_INCLUDE_DIRS}
-    PARENT_SCOPE)
+set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE)
 
 # flush denormals to zero and enable verbose PTXAS output
 set(FINUFFT_CUDA_FLAGS
@@ -39,39 +38,36 @@ set(FINUFFT_CUDA_FLAGS
     -maxrregcount
     64
     >
-    >)
+    >
+)
 
 if(FINUFFT_SHARED_LINKING)
-  add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC}
-                               ${PRECISION_DEPENDENT_SRC})
+    add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC})
 else()
-  add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC}
-                               ${PRECISION_DEPENDENT_SRC})
-  set_target_properties(
-    cufinufft PROPERTIES POSITION_INDEPENDENT_CODE
-                         ${FINUFFT_POSITION_INDEPENDENT_CODE})
+    add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC})
+    set_target_properties(cufinufft PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE})
 endif()
 target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
 # set target build location
-set_target_properties(cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY
-                                           "${PROJECT_BINARY_DIR}")
+set_target_properties(cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}")
 
 set_target_properties(
-  cufinufft
-  PROPERTIES CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
-             CUDA_SEPARABLE_COMPILATION ON
-             CUDA_STANDARD 17
-             CUDA_STANDARD_REQUIRED ON
-             WINDOWS_EXPORT_ALL_SYMBOLS ON
-             ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+    cufinufft
+    PROPERTIES
+        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_STANDARD 17
+        CUDA_STANDARD_REQUIRED ON
+        WINDOWS_EXPORT_ALL_SYMBOLS ON
+        ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
 target_compile_features(cufinufft PRIVATE cxx_std_17)
 target_compile_options(cufinufft PRIVATE ${FINUFFT_CUDA_FLAGS})
 if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS))
-  target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft)
+    target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft)
 else()
-  target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static)
+    target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static)
 endif()
 
 file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h")
-set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER
-                                           "${CUFINUFFT_PUBLIC_HEADERS}")
+set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}")
diff --git a/src/cuda/README b/src/cuda/README
index 5b3166d81..dc4189482 100644
--- a/src/cuda/README
+++ b/src/cuda/README
@@ -18,7 +18,7 @@ the Flatiron Institute, advised by Alex Barnett.
 This folder contains the main source files of the GPU implementations.
 
 - cufinufft.cu
-  Four main stages of cufinufft API. 
+  Four main stages of cufinufft API.
   (1) cufinufft_makeplan, (2) cufinufft_setpts, (3) cufinufft_execute, (4) cufinufft_destroy.
   Also, cufinufft_default_opts may precede stage 1.
 
diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu
index 20bbb6355..2b46b3908 100644
--- a/src/cuda/memtransfer_wrapper.cu
+++ b/src/cuda/memtransfer_wrapper.cu
@@ -434,13 +434,14 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
   utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
   // Fixes a crash whewre the plan itself is deleted before the stream
   const auto stream = d_plan->stream;
-  // Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program (it is d_fk)
-  if(!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3)
+  // Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program
+  // (it is d_fk)
+  if (!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3)
     CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools);
-  
+
   CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools);
   CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 68979f3e5..71b70ef3c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -34,22 +34,14 @@ if(FINUFFT_USE_DUCC0)
 endif()
 target_compile_features(testutils PRIVATE cxx_std_17)
 finufft_link_test(testutils)
-add_test(
-    NAME run_testutils
-    COMMAND testutils
-    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
-)
+add_test(NAME run_testutils COMMAND testutils WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
 if(NOT FINUFFT_USE_DUCC0)
     add_executable(fftw_lock_test fftw_lock_test.cpp)
     target_compile_features(fftw_lock_test PRIVATE cxx_std_17)
     finufft_link_test(fftw_lock_test)
 
-    add_test(
-        NAME run_fftw_lock_test
-        COMMAND fftw_lock_test
-        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
-    )
+    add_test(NAME run_fftw_lock_test COMMAND fftw_lock_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 endif()
 
 # Add ctest definitions that run at both precisions...
@@ -58,11 +50,7 @@ function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX)
     # as in makefile. This prevents them taking a huge time on a, say, 128-core
     # Rome node. ... but I don't know platform-indep way to do that! Does anyone?
 
-    add_test(
-        NAME run_basic_pass_fail_${PREC}
-        COMMAND basicpassfail${SUFFIX}
-        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
-    )
+    add_test(NAME run_basic_pass_fail_${PREC} COMMAND basicpassfail${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
     add_test(
         NAME run_finufft1d_test_${PREC}
@@ -72,52 +60,39 @@ function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX)
 
     add_test(
         NAME run_finufft1dmany_test_${PREC}
-        COMMAND
-            finufft1dmany_test${SUFFIX} 3 1e2 2e2 ${REQ_TOL} 0 0 0 2 0.0
-            ${CHECK_TOL}
+        COMMAND finufft1dmany_test${SUFFIX} 3 1e2 2e2 ${REQ_TOL} 0 0 0 2 0.0 ${CHECK_TOL}
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
     )
 
     add_test(
         NAME run_finufft2d_test_${PREC}
-        COMMAND
-            finufft2d_test${SUFFIX} 1e2 1e1 1e3 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL}
+        COMMAND finufft2d_test${SUFFIX} 1e2 1e1 1e3 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL}
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
     )
 
     add_test(
         NAME run_finufft2dmany_test_${PREC}
-        COMMAND
-            finufft2dmany_test${SUFFIX} 3 1e2 1e1 1e3 ${REQ_TOL} 0 0 0 2 0.0
-            ${CHECK_TOL}
+        COMMAND finufft2dmany_test${SUFFIX} 3 1e2 1e1 1e3 ${REQ_TOL} 0 0 0 2 0.0 ${CHECK_TOL}
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
     )
 
     add_test(
         NAME run_finufft3d_test_${PREC}
-        COMMAND
-            finufft3d_test${SUFFIX} 5 10 20 1e2 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL}
+        COMMAND finufft3d_test${SUFFIX} 5 10 20 1e2 ${REQ_TOL} 0 2 0.0 ${CHECK_TOL}
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
     )
 
     add_test(
         NAME run_finufft3dmany_test_${PREC}
-        COMMAND
-            finufft3dmany_test${SUFFIX} 2 5 10 20 1e2 ${REQ_TOL} 0 0 0 2 0.0
-            ${CHECK_TOL}
+        COMMAND finufft3dmany_test${SUFFIX} 2 5 10 20 1e2 ${REQ_TOL} 0 0 0 2 0.0 ${CHECK_TOL}
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
     )
 
-    add_test(
-        NAME run_dumbinputs_${PREC}
-        COMMAND dumbinputs${SUFFIX}
-        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
-    )
+    add_test(NAME run_dumbinputs_${PREC} COMMAND dumbinputs${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
     add_test(
         NAME spreadinterp1d_${PREC}
-        COMMAND
-            spreadinterp1d_test${SUFFIX} 1e3 1e3 ${REQ_TOL} 0 2 2.0 ${CHECK_TOL}
+        COMMAND spreadinterp1d_test${SUFFIX} 1e3 1e3 ${REQ_TOL} 0 2 2.0 ${CHECK_TOL}
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
     )
 endfunction()
diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt
index 2aac08711..01de47cc4 100644
--- a/test/cuda/CMakeLists.txt
+++ b/test/cuda/CMakeLists.txt
@@ -5,10 +5,7 @@ foreach(srcfile ${test_src})
     get_filename_component(executable ${executable} NAME)
     add_executable(${executable} ${srcfile})
     target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-    target_compile_options(
-        ${executable}
-        PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
-    )
+    target_compile_options(${executable} PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
     find_library(MathLib m)
     if(MathLib)
         target_link_libraries(${executable} PUBLIC cufinufft ${MathLib})
@@ -16,9 +13,7 @@ foreach(srcfile ${test_src})
     target_compile_features(${executable} PUBLIC cxx_std_17)
     set_target_properties(
         ${executable}
-        PROPERTIES
-            LINKER_LANGUAGE CUDA
-            CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
+        PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
     )
     message(
         STATUS
@@ -31,214 +26,154 @@ endforeach()
 function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
     add_test(
         NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft1d_test 0 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft1d_test 0 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft1d_test 1 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft1d_test 1 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft1d1_test_SM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft1d_test 2 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft1d_test 2 1 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft1d_test 0 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft1d_test 0 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft1d_test 1 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft1d_test 1 2 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft1d_test 0 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft1d_test 0 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft1d_test 1 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft1d_test 1 3 2e3 4e3 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d1_test_SM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL}
-            ${PREC} ${UPSAMP}
+        COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d1many_test_SM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL}
-            ${PREC} ${UPSAMP}
+        COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d2many_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL}
-            ${PREC} ${UPSAMP}
+        COMMAND cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d2many_test_SM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL}
-            ${PREC} ${UPSAMP}
+        COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d3many_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2dmany_test 1 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL}
-            ${PREC} ${UPSAMP}
+        COMMAND cufinufft2dmany_test 1 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft2d3many_test_SM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL}
-            ${PREC} ${UPSAMP}
+        COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft3d1_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     if(${PREC} STREQUAL "float")
         add_test(
             NAME cufinufft3d1_test_SM_${PREC}_${UPSAMP}
-            COMMAND
-                cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-                ${UPSAMP}
+            COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
         )
 
         add_test(
             NAME cufinufft3d1_test_block_${PREC}_${UPSAMP}
-            COMMAND
-                cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-                ${UPSAMP}
+            COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
         )
 
         add_test(
             NAME cufinufft3d2_test_SM_${PREC}_${UPSAMP}
-            COMMAND
-                cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-                ${UPSAMP}
+            COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
         )
 
         add_test(
             NAME cufinufft3d3_test_SM_${PREC}_${UPSAMP}
-            COMMAND
-                cufinufft3d_test 2 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-                ${UPSAMP}
+            COMMAND cufinufft3d_test 2 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
         )
     endif()
 
     add_test(
         NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}
     )
 
     add_test(
         NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP}
-        COMMAND
-            cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 ${PREC}
-            ${UPSAMP}
+        COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 ${PREC} ${UPSAMP}
     )
 endfunction()
 
diff --git a/test/spreadinterp1d_test.cpp b/test/spreadinterp1d_test.cpp
index 981ae70c8..4aecc9c88 100644
--- a/test/spreadinterp1d_test.cpp
+++ b/test/spreadinterp1d_test.cpp
@@ -63,7 +63,7 @@ int main(int argc, char *argv[]) {
   CPX kersum = 0.0;
   for (auto Fk : F) kersum += Fk; // one kernel mass
 
-    // generate random nonuniform points (x) and complex strengths (c)
+  // generate random nonuniform points (x) and complex strengths (c)
 #pragma omp parallel
   {
     unsigned int se = MY_OMP_GET_THREAD_NUM();   // needed for parallel random #s
diff --git a/tutorial/README b/tutorial/README
index 89e61c2b1..7b037933a 100644
--- a/tutorial/README
+++ b/tutorial/README
@@ -4,4 +4,3 @@ So far, this is a collection of MATLAB/Octave codes,
 which make use of utilities in utils/
 
 See the "Tutorials and application demos" section of the documentation.
-
diff --git a/tutorial/inv1d2.m b/tutorial/inv1d2.m
index 179c1b794..3030f92e3 100644
--- a/tutorial/inv1d2.m
+++ b/tutorial/inv1d2.m
@@ -39,7 +39,7 @@
   [f,flag,relres,iter] = pcg(@(f) applyToep(f,vhat), rhs, 1e-6, N);
   fprintf('CG-Toep relres %.3g done in %d iters, %.3g s\n', relres,iter,toc)
 end
-  
+
 yrecon = finufft1d2(x,+1,tol,f);
 fprintf('\trel l2 resid of Af=y: %.3g\n', norm(yrecon-y)/norm(y))
 fprintf('\trel l2 coeff err: %.3g\n', norm(f-ftrue)/norm(ftrue))
diff --git a/tutorial/serieseval1d.m b/tutorial/serieseval1d.m
index 222000d68..7838a3c47 100644
--- a/tutorial/serieseval1d.m
+++ b/tutorial/serieseval1d.m
@@ -43,5 +43,5 @@
 hold on; plot(yi,real(fi),'r.'); legend('NU pts','unif pts');   % eye norm is ok
 
 % math check: send the unif pts into NUFFT and compare answers
-ci = finufft1d2(yi * (2*pi/L),+1,tol,fk); 
+ci = finufft1d2(yi * (2*pi/L),+1,tol,fk);
 fprintf('max error on %d unif test pts; %.3g\n',Mp,norm(fi-ci,inf))
diff --git a/tutorial/utils/lgwt.m b/tutorial/utils/lgwt.m
index 6b6040374..0218e957b 100644
--- a/tutorial/utils/lgwt.m
+++ b/tutorial/utils/lgwt.m
@@ -1,7 +1,7 @@
 function [x,w]=lgwt(N,a,b)
 % lgwt.m
 %
-% This script is for computing definite integrals using Legendre-Gauss 
+% This script is for computing definite integrals using Legendre-Gauss
 % Quadrature. Computes the Legendre-Gauss nodes and weights  on an interval
 % [a,b] with truncation order N
 %
@@ -12,29 +12,29 @@
 %
 % Written by Greg von Winckel - 02/25/2004
 %
-% Copyright (c) 2009, Greg von Winckel 
+% Copyright (c) 2009, Greg von Winckel
 % All rights reserved.
 %
-% Redistribution and use in source and binary forms, with or without 
-% modification, are permitted provided that the following conditions are 
+% Redistribution and use in source and binary forms, with or without
+% modification, are permitted provided that the following conditions are
 % met:
 %
-% * Redistributions of source code must retain the above copyright 
-% notice, this list of conditions and the following disclaimer. 
-% * Redistributions in binary form must reproduce the above copyright 
-% notice, this list of conditions and the following disclaimer in 
+% * Redistributions of source code must retain the above copyright
+% notice, this list of conditions and the following disclaimer.
+% * Redistributions in binary form must reproduce the above copyright
+% notice, this list of conditions and the following disclaimer in
 % the documentation and/or other materials provided with the distribution
 %
-% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
-% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
-% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
-% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
-% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
-% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
-% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
-% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
-% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
-% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 % POSSIBILITY OF SUCH DAMAGE.
 %
     N=N-1;
@@ -48,18 +48,18 @@
     % using the recursion relation and the Newton-Raphson method
     y0=2;
     % Iterate until new points are uniformly within epsilon of old points
-    while max(abs(y-y0))>eps      
+    while max(abs(y-y0))>eps
         L(:,1)=1;
         L(:,2)=y;
         for k=2:N1
             L(:,k+1)=( (2*k-1)*y.*L(:,k)-(k-1)*L(:,k-1) )/k;
-        end        
-        Lp=(N2)*( L(:,N1)-y.*L(:,N2) )./(1-y.^2);           
+        end
+        Lp=(N2)*( L(:,N1)-y.*L(:,N2) )./(1-y.^2);
         y0=y;
-        y=y0-L(:,N2)./Lp;        
+        y=y0-L(:,N2)./Lp;
     end
     % Linear map from[-1,1] to [a,b]
-    x=flipud((a*(1-y)+b*(1+y))/2);      
+    x=flipud((a*(1-y)+b*(1+y))/2);
     % Compute the weights
     w=(b-a)./((1-y.^2).*Lp.^2)*(N2/N1)^2;
 end

From 82b50a6091571ca81228b2b402e027928b5eb9fc Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Tue, 25 Feb 2025 16:42:23 -0500
Subject: [PATCH 2/5] ignoring

---
 .git-blame-ignore-revs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 72361d1e3..009c21d34 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -1,2 +1,3 @@
 # Applied clang format to the codebase
 884ba427be0c60aa3399d5ea71b0e9e3a7cbf686
+b1e484fb294b2759d3b6b1f68ca0bf5e255b87d1

From b292fb01af57545db97d0c09fd4cccccef6b4b16 Mon Sep 17 00:00:00 2001
From: Marco Barbone <DiamonDinoia@users.noreply.github.com>
Date: Thu, 27 Feb 2025 11:22:21 -0500
Subject: [PATCH 3/5] fixes in cmake for cuda (#640)

Removing duplicated FINUFFT_CUDA_ARCHITECTURES & enabling LTO on CPU
---
 CHANGELOG                    |  2 ++
 CMakeLists.txt               | 29 ++++++++++++++++-------------
 cmake/utils.cmake            | 11 +++++++++++
 docs/install_gpu.rst         |  2 ++
 perftest/cuda/CMakeLists.txt |  2 +-
 src/cuda/CMakeLists.txt      | 20 ++++++++++++--------
 test/CMakeLists.txt          |  1 -
 test/cuda/CMakeLists.txt     |  4 ++--
 8 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 4798ab4bd..da32b1b87 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,8 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
 Master, using release name V 2.4.0 (2/11/25)
 
+* Removed FINUFFT_CUDA_ARCHITECTURES flag, as it was unnecessary duplication.
+* Enabled LTO for finufft, nvcc support is flaky at the moment.
 * Added GPU spread interp only test. Added CPU spread interp only test to cmake
 * Make attributes private in Python Plan classes and allow read-only access to
   them using properties (Andén #608).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a9d93d93..8d21cfc3f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,8 +25,6 @@ option(FINUFFT_BUILD_DOCS "Whether to build the FINUFFT documentation" OFF)
 # if FINUFFT_USE_DUCC0 is ON, the following options are ignored
 set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")
 set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
-# if FINUFFT_USE_CUDA is OFF, the following options are ignored
-set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)")
 # if FINUFFT_USE_CPU is OFF, the following options are ignored
 set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying target architecture, defaults to -march=native")
 # sphinx tag (don't remove): @cmake_opts_end
@@ -77,7 +75,8 @@ set(FINUFFT_CXX_FLAGS_DEBUG
     -ggdb
     -ggdb3
     -Wall
-    -Wno-sign-compare
+    -Wextra
+    -Wpedantic
     -Wno-unknown-pragmas
 )
 
@@ -208,7 +207,12 @@ function(finufft_link_test target)
     endif()
     enable_asan(${target})
     target_compile_features(${target} PRIVATE cxx_std_17)
-    set_target_properties(${target} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+    set_target_properties(
+        ${target}
+        PROPERTIES
+            MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
+            POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}
+    )
     # disable deprecated warnings for tests if supported
     if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)
         target_compile_options(${target} PRIVATE -Wno-deprecated-declarations)
@@ -277,16 +281,14 @@ if(FINUFFT_USE_CPU)
 endif()
 
 if(FINUFFT_USE_CUDA)
-    if(NOT DEFINED FINUFFT_CUDA_ARCHITECTURES)
-        if(DEFINED CMAKE_CUDA_ARCHITECTURES)
-            set(FINUFFT_CUDA_ARCHITECTURES "{$CMAKE_CUDA_ARCHITECTURES}")
-        else()
-            message(
-                "FINUFFT WARNING: No CUDA architecture supplied via '-DFINUFFT_CUDA_ARCHITECTURES=...', defaulting to 'native'"
-            )
-            message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
-        endif()
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        message(
+            WARNING
+            "FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native'"
+        )
+        message(WARNING "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
     endif()
+    set(CMAKE_CUDA_ARCHITECTURES "native")
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     add_subdirectory(src/cuda)
@@ -348,6 +350,7 @@ message(STATUS "  FINUFFT_FFTW_SUFFIX: ${FINUFFT_FFTW_SUFFIX}")
 message(STATUS "  FINUFFT_FFTW_LIBRARIES: ${FINUFFT_FFTW_LIBRARIES}")
 message(STATUS "  FINUFFT_ARCH_FLAGS: ${FINUFFT_ARCH_FLAGS}")
 message(STATUS "  FINUFFT_USE_DUCC0: ${FINUFFT_USE_DUCC0}")
+message(STATUS "  CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
 # gersemi: on
 
 if(FINUFFT_ENABLE_INSTALL)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index bcbbb5169..b5571605f 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -75,3 +75,14 @@ function(copy_dll source_target destination_target)
     unset(SOURCE_FILE)
     unset(DESTINATION_FILE)
 endfunction()
+
+include(CheckIPOSupported)
+check_ipo_supported(RESULT LTO_SUPPORTED OUTPUT LTO_ERROR)
+
+if(LTO_SUPPORTED)
+    message(STATUS "LTO is supported and enabled.")
+    set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION TRUE)
+else()
+    message(WARNING "LTO is not supported: ${LTO_ERROR}")
+    set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION FALSE)
+endif()
diff --git a/docs/install_gpu.rst b/docs/install_gpu.rst
index ec6a74b60..0c5b4f312 100644
--- a/docs/install_gpu.rst
+++ b/docs/install_gpu.rst
@@ -45,6 +45,8 @@ To find out your own device's compute capability without having to look it up on
 This will return a text string such as ``8.6`` which would incidate
 ``sm_86`` architecture, thus to use ``CMAKE_CUDA_ARCHITECTURES=86``.
 
+Note that by default the ``CMAKE_CUDA_ARCHITECTURES`` flag is set to ``native``, which means that the code will be compiled for the compute capability of the GPU on which the code is being compiled.
+This might not be portable so it is recommended to set this flag explicitly when building for multiple systems. A good alternative is ``all-major`` which will compile for all major compute capabilities.
 
 Testing
 -------
diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt
index fdf04e723..6f440ed35 100644
--- a/perftest/cuda/CMakeLists.txt
+++ b/perftest/cuda/CMakeLists.txt
@@ -7,7 +7,7 @@ set_target_properties(
     cuperftest
     PROPERTIES
         LINKER_LANGUAGE CUDA
-        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
+        CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
         CUDA_STANDARD 17
         CUDA_STANDARD_REQUIRED ON
 )
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index eccebb17a..83eba1c96 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -34,18 +34,14 @@ set(FINUFFT_CUDA_FLAGS
     -fmad=true
     -restrict
     --extra-device-vectorization
-    $<$<CONFIG:Debug>:-G
-    -maxrregcount
-    64
-    >
-    >
+    -Xnvlink
+    --strip-all>
 )
 
 if(FINUFFT_SHARED_LINKING)
     add_library(cufinufft SHARED ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC})
 else()
     add_library(cufinufft STATIC ${PRECISION_INDEPENDENT_SRC} ${PRECISION_DEPENDENT_SRC})
-    set_target_properties(cufinufft PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE})
 endif()
 target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
 # set target build location
@@ -54,20 +50,28 @@ set_target_properties(cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_B
 set_target_properties(
     cufinufft
     PROPERTIES
-        CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
+        CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
         CUDA_SEPARABLE_COMPILATION ON
         CUDA_STANDARD 17
         CUDA_STANDARD_REQUIRED ON
         WINDOWS_EXPORT_ALL_SYMBOLS ON
         ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+        INTERPROCEDURAL_OPTIMIZATION
+            OFF # LTO is not supported for CUDA for now
+        POSITION_INDEPENDENT_CODE ${FINUFFT_POSITION_INDEPENDENT_CODE}
 )
 target_compile_features(cufinufft PRIVATE cxx_std_17)
 target_compile_options(cufinufft PRIVATE ${FINUFFT_CUDA_FLAGS})
-if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS))
+if(WIN32 OR (BUILD_TESTING AND FINUFFT_BUILD_TESTS) OR env{CIBUILDWHEEL})
     target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft)
 else()
     target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static)
 endif()
 
+# disable deprecated warnings for tests if supported
+if(FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)
+    target_compile_options(cufinufft PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-declarations>)
+endif()
+
 file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h")
 set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}")
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 71b70ef3c..7f2184fec 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -40,7 +40,6 @@ if(NOT FINUFFT_USE_DUCC0)
     add_executable(fftw_lock_test fftw_lock_test.cpp)
     target_compile_features(fftw_lock_test PRIVATE cxx_std_17)
     finufft_link_test(fftw_lock_test)
-
     add_test(NAME run_fftw_lock_test COMMAND fftw_lock_test WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 endif()
 
diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt
index 01de47cc4..3e1bada51 100644
--- a/test/cuda/CMakeLists.txt
+++ b/test/cuda/CMakeLists.txt
@@ -13,12 +13,12 @@ foreach(srcfile ${test_src})
     target_compile_features(${executable} PUBLIC cxx_std_17)
     set_target_properties(
         ${executable}
-        PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
+        PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
     )
     message(
         STATUS
         "Adding test ${executable}"
-        " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}"
+        " with CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
         " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}"
     )
 endforeach()

From 30de07872b31030a08944874fd41818e562f0b65 Mon Sep 17 00:00:00 2001
From: Marco Barbone <DiamonDinoia@users.noreply.github.com>
Date: Thu, 27 Feb 2025 17:55:39 -0500
Subject: [PATCH 4/5] initial upsampfact tweak (#638)

Adding two python utils to generate a dataset of performance between upsampfact 2.0 and 1.25 across multiple densities.
Using it to populate a heuristic that chooses the best one. In some cases this is a significant speedup.
---
 CHANGELOG                      |   2 +
 devel/analyse_upsamp.py        |  65 ++++++
 devel/wisdom.py                | 199 +++++++++++++++++++
 include/finufft/heuristics.hpp | 353 +++++++++++++++++++++++++++++++++
 src/finufft_core.cpp           |  22 +-
 5 files changed, 629 insertions(+), 12 deletions(-)
 create mode 100644 devel/analyse_upsamp.py
 create mode 100644 devel/wisdom.py
 create mode 100644 include/finufft/heuristics.hpp

diff --git a/CHANGELOG b/CHANGELOG
index da32b1b87..c7112b41f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,8 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
 Master, using release name V 2.4.0 (2/11/25)
 
+* Tweaked choice of upsampfact to use a density based heuristic for type 1 and 2
+  in the CPU library. This gives a significant speedup for some cases.
 * Removed FINUFFT_CUDA_ARCHITECTURES flag, as it was unnecessary duplication.
 * Enabled LTO for finufft, nvcc support is flaky at the moment.
 * Added GPU spread interp only test. Added CPU spread interp only test to cmake
diff --git a/devel/analyse_upsamp.py b/devel/analyse_upsamp.py
new file mode 100644
index 000000000..b45bf3236
--- /dev/null
+++ b/devel/analyse_upsamp.py
@@ -0,0 +1,65 @@
+import pandas as pd
+import numpy as np
+
+# Load the dataset
+file_path = "wisdom.csv"  # Change this to the actual file path
+df = pd.read_csv(file_path)
+
+# Convert specified columns to numeric, coercing errors to NaN
+columns_to_convert = ['Num_pts', 'Time_2.0', 'Time_1.25']
+for col in columns_to_convert:
+    df[col] = pd.to_numeric(df[col], errors='coerce')
+
+# Drop rows with NaN values in these columns (optional, but recommended)
+df = df.dropna(subset=columns_to_convert)
+# scientific notation to Epsilon
+df['Epsilon'] = df['Epsilon'].apply(lambda x: f'{x:.1e}')
+
+# round Time_2.0 and Time_1.25 to 2 decimal places
+df['Time_2.0'] = df['Time_2.0'].apply(lambda x: round(x, 2))
+df['Time_1.25'] = df['Time_1.25'].apply(lambda x: round(x, 2))
+
+
+# Extracting grid dimensions and volume from 'Size' column
+def parse_grid_size(size):
+    try:
+        size = str(size).strip()
+        if 'x' in size:
+            dims = list(map(int, size.split('x')))
+            volume = np.prod(dims)
+            dim = len(dims)
+        else:
+            volume = int(size)
+            dim = 1
+        return volume, str(dim)
+    except ValueError:
+        return np.nan, np.nan
+
+
+df[['Volume', 'Dim']] = df['Size'].apply(lambda x: pd.Series(parse_grid_size(str(x))))
+
+# Compute density
+df['Density'] = df['Num_pts'] / df['Volume']
+# remove columns where Time_2.0 is > Time_1.25
+df = df[df['Time_2.0'] <= df['Time_1.25']]
+
+# drop columns where diff between Time_2.0 and Time_1.25 is less than 5%
+df['% Diff'] = abs(df['Time_2.0'] - df['Time_1.25']) / df['Time_1.25']
+df = df[df['% Diff'] >= .05]
+
+# keep rows where n_threads is 1
+df = df[df['n_threads'] == 16]
+# drop time columns
+df.drop(columns=['Time_2.0', 'Time_1.25', 'n_threads'], inplace=True)
+df.drop(columns=['% Diff', 'Size', 'Num_pts', 'Volume', 'Unnamed: 0'], inplace=True)
+# group by NUFFT_type, Data_type, Density, Dim, n_threads to make print more readable, concatenate values do not aggregate them
+# Group by specific columns and concatenate values instead of aggregating them
+grouped_df = df.groupby(['NUFFT_type', 'Data_type', 'Density', 'Dim'], as_index=False).agg(lambda x: ', '.join(x))
+# grouped_df['Epsilon'] = grouped_df['Epsilon'].apply(lambda x: "{:.1e}".format(float(x)))
+
+# sort by NUFFT type first, then Dim and then Density
+grouped_df = grouped_df.sort_values(by=['NUFFT_type', 'Data_type', 'Dim', 'Density'])
+grouped_df = grouped_df[['NUFFT_type', 'Data_type', 'Dim', 'Density', 'Epsilon']]
+
+# Display the final dataset
+print(grouped_df.to_string(index=False))
diff --git a/devel/wisdom.py b/devel/wisdom.py
new file mode 100644
index 000000000..9a3baa3ec
--- /dev/null
+++ b/devel/wisdom.py
@@ -0,0 +1,199 @@
+import numpy as np
+import finufft
+import timeit
+import statistics
+import matplotlib.pyplot as plt
+import os
+import pandas as pd
+import sys
+
+sys.stdout.reconfigure(line_buffering=True)  # Ensures auto-flushing
+
+
+# Global list for collecting detailed benchmark rows
+all_results = []
+
+def benchmark_function(func, *args, runs=5, **kwargs):
+    """Runs the function multiple times and returns average & std dev."""
+    runtimes = [
+        timeit.timeit(lambda: func(*args, **kwargs), number=1)
+        for _ in range(runs)
+    ]
+    avg_runtime = statistics.mean(runtimes)
+    stdev_runtime = statistics.stdev(runtimes) if runs > 1 else 0.0
+    return avg_runtime, stdev_runtime
+
+# cache functiojn results with decorator
+def generate_random_data(nufft_type, nufft_sizes, num_pts, dtype):
+    """Generates random NUFFT input data in the correct dtype."""
+    dim = len(nufft_sizes)
+    rng = np.random.Generator(np.random.SFC64(42))
+    # Set the correct complex dtype
+    complex_dtype = np.complex64 if dtype == np.float32 else np.complex128
+
+    # Generate data for nonuniform points and coefficients
+    x = [np.array(2 * np.pi * rng.random(num_pts) - np.pi, dtype=dtype) for _ in range(dim)]
+    c = np.array(rng.random(num_pts) + 1j * rng.random(num_pts), dtype=complex_dtype)
+
+    f = rng.standard_normal(nufft_sizes, dtype=dtype) + 1j * rng.standard_normal(nufft_sizes, dtype=dtype) if nufft_type == 2 else None
+    d = [np.array(2 * np.pi * rng.random(num_pts) - np.pi, dtype=dtype) for _ in range(dim)] if nufft_type == 3 else None
+    return x, c, f, d
+
+def run_nufft(nufft_type, nufft_sizes, epsilon, n_threads, upsampfac, x, c, f, d):
+    """Runs NUFFT with the correct dtype and parameters."""
+    opts = {'nthreads': n_threads, 'upsampfac': upsampfac, 'debug': 0}
+    dim = len(nufft_sizes)
+
+    if nufft_type == 1:
+        if dim == 1:
+            return finufft.nufft1d1(x[0], c, nufft_sizes[0], eps=epsilon, **opts)
+        elif dim == 2:
+            return finufft.nufft2d1(x[0], x[1], c, nufft_sizes, eps=epsilon, **opts)
+        elif dim == 3:
+            return finufft.nufft3d1(x[0], x[1], x[2], c, nufft_sizes, eps=epsilon, **opts)
+    elif nufft_type == 2:
+        if dim == 1:
+            return finufft.nufft1d2(x[0], f, eps=epsilon, **opts)
+        elif dim == 2:
+            return finufft.nufft2d2(x[0], x[1], f, eps=epsilon, **opts)
+        elif dim == 3:
+            return finufft.nufft3d2(x[0], x[1], x[2], f, eps=epsilon, **opts)
+    elif nufft_type == 3:
+        if dim == 1:
+            return finufft.nufft1d3(x[0], c, d[0], eps=epsilon, **opts)
+        elif dim == 2:
+            return finufft.nufft2d3(x[0], x[1], c, d[0], d[1], eps=epsilon, **opts)
+        elif dim == 3:
+            return finufft.nufft3d3(x[0], x[1], x[2], c, d[0], d[1], d[2], eps=epsilon, **opts)
+
+    else:
+        raise ValueError("Invalid NUFFT type. Use 1, 2, or 3.")
+
+def benchmark_nufft_collection(nufft_type, nufft_sizes, num_pts, epsilons, n_threads, upsampfacs, dtype, runs=5,
+                               description=""):
+    """Runs benchmarks while measuring performance across densities and records detailed results."""
+    results = {upsamp: [] for upsamp in upsampfacs}
+    x, c, f, d = generate_random_data(nufft_type, nufft_sizes, num_pts, dtype)
+
+    # Compute density
+    size_product = np.prod(nufft_sizes)
+    density = num_pts / size_product
+
+    title = (f"NUFFT Type {nufft_type}, {len(nufft_sizes)}D, {dtype.__name__} "
+             f"(Size: {'x'.join(map(str, nufft_sizes))}, Num Pts: {num_pts}, Density: {density:.2f}, Threads: {n_threads})")
+    print(f"\n=== DEBUG: {title} ===")
+    print(f"{'Epsilon':<10} | {'1.25s':<12} | {'2.0s':<12} | % Diff")
+
+    for epsilon in epsilons:
+        runtimes = {}
+        for upsamp in upsampfacs:
+            avg_runtime, _ = benchmark_function(
+                run_nufft, nufft_type, nufft_sizes, epsilon, n_threads, upsamp, x, c, f, d, runs=runs
+            )
+            runtimes[upsamp] = avg_runtime
+            results[upsamp].append(avg_runtime)
+
+        diff = ((runtimes[2.0] - runtimes[1.25]) / runtimes[1.25]) * 100
+        print(f"{epsilon:<10.1e} | {runtimes[1.25]:<12.6f} | {runtimes[2.0]:<12.6f} | {diff:+.1f}%")
+
+        # Append a row of results for this epsilon value
+        all_results.append({
+            'Epsilon': epsilon,
+            'Time_1.25': runtimes[1.25],
+            'Time_2.0': runtimes[2.0],
+            '% Diff': diff,
+            'NUFFT_type': nufft_type,
+            'Data_type': dtype.__name__,
+            'Size': 'x'.join(map(str, nufft_sizes)),
+            'Num_pts': num_pts,
+            'Density': density,
+            'n_threads': n_threads
+        })
+
+    plot_benchmark_results(results, epsilons, upsampfacs, title, density)
+    return results
+
+def plot_benchmark_results(results, epsilons, upsampfacs, title, density):
+    """Plots performance results while including density information."""
+    x_axis = np.arange(len(epsilons))
+    width = 0.35  # Bar width
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    colors = ['tab:blue', 'tab:orange']
+
+    for i, upsamp in enumerate(upsampfacs):
+        ax.bar(x_axis + (i - 0.5) * width, results[upsamp], width, label=f"upsampfac = {upsamp}", color=colors[i])
+
+    ax.set_xlabel("Epsilon")
+    ax.set_ylabel("Average Runtime (s)")
+    ax.set_title(f"{title} - Density {density:.2f}")
+    ax.set_xticks(x_axis)
+    ax.set_xticklabels([f"{eps:.0e}" for eps in epsilons])
+    ax.set_yscale("log")
+    ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
+    plt.tight_layout()
+
+    os.makedirs("plots", exist_ok=True)
+    filename = f"plots/{title.replace(' ', '_').replace(',', '').replace(':', '').replace('.', '_')}.png"
+    plt.savefig(filename)
+    # plt.show()
+
+# Define parameters for benchmarking
+upsampfacs = [2.0, 1.25]
+runs = 5
+
+for n_threads in reversed([16]):
+    # total_elements = 100**3 if n_threads == 1 else 216**3
+    total_elements = 100**3 if n_threads == 1 else 216**3
+    # Select test dimensions for 1D, 2D, and 3D
+    size_1d = (int(total_elements),)
+    size_2d = (int(np.sqrt(total_elements)), int(np.sqrt(total_elements)))
+    size_3d = (int(np.cbrt(total_elements)), int(np.cbrt(total_elements)), int(np.cbrt(total_elements)))
+
+    # Define num_pts range: starts with 1/16th of volume, ends with 1024x the volume
+    volume_1d = np.prod(size_1d)
+    volume_2d = np.prod(size_2d)
+    volume_3d = np.prod(size_3d)
+
+    num_pts_range = lambda volume: [volume // 16 * (2**i) for i in range(10)]
+
+
+    test_cases = []
+    for nufft_type in [2, 1]:
+            for size, desc in [(size_1d, "1D"), (size_2d, "2D"), (size_3d, "3D")]:
+                for num_pts in reversed(num_pts_range(np.prod(size))):
+                    test_cases.append({
+                        "nufft_type": nufft_type,
+                        "nufft_sizes": size,
+                        "num_pts": num_pts,
+                        "n_threads": n_threads,
+                        "description": f"NUFFT Type {nufft_type}, {desc}, Threads {n_threads}, Size {'x'.join(map(str, size))}, Num Pts {num_pts}"
+                    })
+
+# Run benchmarks and generate plots for each test case and for both float32 and float64.
+for case in test_cases:
+    for dtype in [np.float32, np.float64]:
+        epsilons = np.logspace(-1, -6, num=6) if dtype == np.float32 else np.logspace(-1, -9, num=9)
+
+        print(f'RUNNING TEST CASE : {case["description"]} with dtype : {dtype.__name__} epsilons : {epsilons}')
+        benchmark_nufft_collection(
+            case["nufft_type"],
+            case["nufft_sizes"],
+            case["num_pts"],
+            epsilons,
+            case["n_threads"],
+            upsampfacs,
+            dtype,
+            runs=runs,
+            description=case["description"]
+        )
+
+# After all benchmarks are done, build and print the final results table.
+df = pd.DataFrame(all_results)
+# Reorder columns as desired.
+df = df[['Epsilon', 'Time_1.25', 'Time_2.0', '% Diff', 'NUFFT_type', 'Data_type', 'Size', 'Num_pts', 'Density', 'n_threads']]
+print("\nFinal Benchmark Results:")
+print(df.to_string(index=False))
+df.to_csv('wisdom.csv')
+df.to_latex('wisdom.tex')
+df.to_markdown('wisdom.md')
diff --git a/include/finufft/heuristics.hpp b/include/finufft/heuristics.hpp
new file mode 100644
index 000000000..462fab9f6
--- /dev/null
+++ b/include/finufft/heuristics.hpp
@@ -0,0 +1,353 @@
+#ifndef HEURISTICS_HPP
+#define HEURISTICS_HPP
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+namespace finufft::heuristics {
+#ifndef FINUFFT_USE_DUCC0
+template<typename T>
+static double bestUpsamplingFactorSinglethread(
+    const double density, const int dim, const int nufftType, const double epsilon) {
+
+  constexpr bool isFloat = std::is_same_v<T, float>;
+
+  if constexpr (isFloat) { // single precision
+    if (nufftType == 1) {
+      if (dim == 2 && density >= 32 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      if (dim == 3 && density >= 2 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+    } // end nufftType == 1
+    if (nufftType == 2) {
+      if (dim == 1 && density >= 8 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      if (dim == 3) {
+        if (density >= 32 && epsilon <= 1.0e-3) {
+          return 2.0;
+        }
+        if (density >= 4 && epsilon <= 1.0e-4) {
+          return 2.0;
+        }
+        if (density >= 1 && epsilon <= 1.0e-5) {
+          return 2.0;
+        }
+        return 2.0;
+      } // end dim == 3
+    } // end nufftType == 2
+  } else { // end single precision, double precision follows
+    if (nufftType == 1) {
+      // interestingly for type 1, 1.25 is always faster in 1D
+      if (dim == 2) {
+        if (density >= 32 && epsilon <= 1.0e-6) {
+          return 2.0;
+        }
+        if (density >= 16 && epsilon <= 1.0e-7) {
+          return 2.0;
+        }
+        if (density >= 8 && epsilon <= 1.0e-8) {
+          return 2.0;
+        }
+      } // end dim == 2
+      if (dim == 3) {
+        if (density >= 16 && epsilon <= 1.0e-3) {
+          return 2.0;
+        }
+        if (density >= 2 && epsilon <= 1.0e-5) {
+          return 2.0;
+        }
+        if (density >= 1 && epsilon <= 1.0e-7) {
+          return 2.0;
+        }
+        if (epsilon <= 1.0e-8) { // for epsilon <= 1.0e-8, 2.0 is favored in 3D always
+          return 2.0;
+        }
+      } // end dim == 3
+    } // end nufftType == 1
+    if (nufftType == 2) {
+      if (dim == 1) {
+        if (density >= 4 && epsilon <= 1.0e-9) {
+          return 2.0;
+        }
+      } // end dim == 1
+      if (dim == 2) {
+        if (density >= 32 && epsilon <= 1.0e-7) {
+          return 2.0;
+        }
+      } // end dim == 2
+      if (dim == 3) {
+        if (density >= 8 && epsilon <= 1.0e-3) {
+          return 2.0;
+        }
+        if (density >= 4 && epsilon <= 1.0e-4) {
+          return 2.0;
+        }
+        if (density >= 1 && epsilon <= 1.0e-5) {
+          return 2.0;
+        }
+        if (density >= 0.5 && epsilon <= 1.0e-7) {
+          return 2.0;
+        }
+        if (density >= 0.25 && epsilon <= 1.0e-8) {
+          return 2.0;
+        }
+      } // end dim == 3
+    } // end nufftType == 2
+  }
+
+  return 1.25;
+}
+
+template<typename T>
+static double bestUpsamplingFactorMultithread(const double density, const int dim,
+                                              const int nufftType, const double epsilon) {
+  constexpr bool isFloat = std::is_same_v<T, float>;
+  if constexpr (isFloat) {
+    // in float multi-threaded, this is the only case where 2.0 is faster
+    if (dim == 3 && nufftType == 1 && epsilon <= 1.0e-6 && density >= 32) {
+      return 2.0;
+    }
+  } else {
+    // In not 3D, 1.25 is always better
+    if (dim != 3) return 1.25;
+
+    if (nufftType == 1) {
+      if (density >= 16 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      if (density >= 8 && epsilon <= 1.0e-7) {
+        return 2.0;
+      }
+      if (density >= 2 && epsilon <= 1.0e-8) {
+        return 2.0;
+      }
+      if (density >= 1 && epsilon <= 1.0e-9) {
+        return 2.0;
+      }
+    }
+    if (nufftType == 2) {
+      if (density >= 16 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      if (density >= 8 && epsilon <= 1.0e-6) {
+        return 2.0;
+      }
+      if (density >= 4 && epsilon <= 1.0e-7) {
+        return 2.0;
+      }
+      if (density >= 2 && epsilon <= 1.0e-8) {
+        return 2.0;
+      }
+      if (density >= 1 && epsilon <= 1.0e-9) {
+        return 2.0;
+      }
+    }
+  }
+
+  return 1.25;
+}
+
+#else
+
+template<typename T>
+static double bestUpsamplingFactorSinglethread(
+    const double density, const int dim, const int nufftType, const double epsilon) {
+
+  constexpr bool isFloat = std::is_same_v<T, float>;
+
+  if constexpr (isFloat) { // single precision
+    if (nufftType == 1) {
+      if (dim == 2 && density >= 4 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      // if we got here and it is not 3D return 1.25
+      if (dim != 3) {
+        return 1.25;
+      }
+      if (density >= 16 && epsilon <= 1.0e-3) {
+        return 2.0;
+      }
+      if (density >= 8 && epsilon <= 1.0e-4) {
+        return 2.0;
+      }
+      if (density >= 1 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+
+    } // end nufftType == 1
+    if (nufftType == 2) {
+      if (dim == 1 && density >= 16 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      // if we got here and it is not 3D return 1.25
+      if (dim != 3) {
+        return 1.25;
+      }
+      // 3D cases only here
+      if (density >= 8 && epsilon <= 1.0e-3) {
+        return 2.0;
+      }
+      if (density >= 2 && epsilon <= 1.0e-4) {
+        return 2.0;
+      }
+      if (density >= 0.5 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      if (density >= 0.25 && epsilon <= 1.0e-6) {
+        return 2.0;
+      }
+    } // end nufftType == 2
+  } else { // end single precision, double precision follows
+    if (nufftType == 1) {
+      // interestingly for type 1, 1.25 is always faster in 1D
+      if (dim == 2) {
+        if (density >= 8 && epsilon <= 1.0e-7) {
+          return 2.0;
+        }
+      } // end dim == 2
+      if (dim == 3) {
+        if (density >= 16 && epsilon <= 1.0e-3) {
+          return 2.0;
+        }
+        if (density >= 8 && epsilon <= 1.0e-4) {
+          return 2.0;
+        }
+        if (density >= 2 && epsilon <= 1.0e-5) {
+          return 2.0;
+        }
+        if (density >= 1 && epsilon <= 1.0e-6) {
+          return 2.0;
+        }
+        if (density >= 0.5 && epsilon <= 1.0e-7) {
+          return 2.0;
+        }
+        if (density >= 0.25 && epsilon <= 1.0e-8) {
+          return 2.0;
+        }
+        if (density >= 0.125 && epsilon <= 1.0e-9) {
+          return 2.0;
+        }
+      } // end dim == 3
+    } // end nufftType == 1
+    if (nufftType == 2) {
+      if (dim == 1) {
+        return 1.25;
+      } // end dim == 1
+      if (dim == 2) {
+        if (density >= 16 && epsilon <= 1.0e-7) {
+          return 2.0;
+        }
+      } // end dim == 2
+      if (dim == 3) {
+        if (density >= 8 && epsilon <= 1.0e-3) {
+          return 2.0;
+        }
+        if (density >= 2 && epsilon <= 1.0e-4) {
+          return 2.0;
+        }
+        if (density >= 1 && epsilon <= 1.0e-5) {
+          return 2.0;
+        }
+        if (density >= 0.5 && epsilon <= 1.0e-6) {
+          return 2.0;
+        }
+        if (density >= 0.25 && epsilon <= 1.0e-8) {
+          return 2.0;
+        }
+        if (density >= 0.125 && epsilon <= 1.0e-9) {
+          return 2.0;
+        }
+      } // end dim == 3
+    } // end nufftType == 2
+  }
+  return 1.25;
+}
+
+template<typename T>
+static double bestUpsamplingFactorMultithread(const double density, const int dim,
+                                              const int nufftType, const double epsilon) {
+  constexpr bool isFloat = std::is_same_v<T, float>;
+  if constexpr (isFloat) {
+    if (dim == 3 && nufftType == 1 && epsilon <= 1.0e-5 && density >= 16) {
+      return 2.0;
+    }
+    if (dim == 3 && nufftType == 2 && epsilon <= 1.0e-5 && density >= 8) {
+      return 2.0;
+    }
+
+  } else {
+    // only case where 2.0 is faster in 2D
+    if (dim == 2 && nufftType == 1 && density >= 32 && epsilon <= 1.0e-8) {
+      return 2.0;
+    }
+    // the following are only 3D cases if it reaches here and is not 3D return 1.25
+    if (dim != 3) {
+      return 1.25;
+    }
+    if (nufftType == 1) { // same as fftw here
+      if (density >= 16 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      if (density >= 8 && epsilon <= 1.0e-7) {
+        return 2.0;
+      }
+      if (density >= 2 && epsilon <= 1.0e-8) {
+        return 2.0;
+      }
+      if (density >= 1 && epsilon <= 1.0e-9) {
+        return 2.0;
+      }
+    }
+    if (nufftType == 2) { // some different cases here
+      if (density >= 8 && epsilon <= 1.0e-5) {
+        return 2.0;
+      }
+      if (density >= 4 && epsilon <= 1.0e-7) {
+        return 2.0;
+      }
+      if (density >= 1 && epsilon <= 1.0e-8) {
+        return 2.0;
+      }
+    }
+  }
+
+  return 1.25;
+}
+
+#endif
+template<typename T>
+double bestUpsamplingFactor(const int nthreads, const double density, const int dim,
+                            const int nufftType, const double epsilon) {
+  // 1) For epsilons <= 1e-9, 1.25 is not supported
+  if (epsilon <= 1.0e-9) {
+    return 2.0;
+  }
+
+  // 2) Special-case for nufftType == 3
+  //    TODO: maybe use the bandwidth here?
+  if (nufftType == 3) {
+    return 1.25;
+  }
+  // 3) For 1 thread, use single-threaded heuristic
+  if (nthreads == 1) {
+    return bestUpsamplingFactorSinglethread<T>(density, dim, nufftType, epsilon);
+  }
+  // 4) For 2 threads, use multi-threaded heuristic
+  if (nthreads > 1) {
+    return bestUpsamplingFactorMultithread<T>(density, dim, nufftType, epsilon);
+  }
+
+  // 4) Otherwise, return 1.25
+  return 1.25;
+}
+
+} // end namespace finufft::heuristics
+
+#endif // HEURISTICS_HPP
diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
index fc36f1f03..6342c7856 100644
--- a/src/finufft_core.cpp
+++ b/src/finufft_core.cpp
@@ -1,6 +1,7 @@
 #include <finufft/fft.h>
 #include <finufft/finufft_core.h>
 #include <finufft/finufft_utils.hpp>
+#include <finufft/heuristics.hpp>
 #include <finufft/spreadinterp.h>
 
 #include "../contrib/legendre_rule_fast.h"
@@ -14,6 +15,7 @@ using namespace finufft;
 using namespace finufft::utils;
 using namespace finufft::spreadinterp;
 using namespace finufft::quadrature;
+using namespace finufft::heuristics;
 
 /* Computational core for FINUFFT.
 
@@ -268,7 +270,7 @@ template<typename T> class KernelFseries {
   FINUFFT_ALWAYS_INLINE T operator()(T k) {
     T x = 0;
     for (size_t n = 0; n < z.size(); ++n)
-      x += f[n] * 2 * cos(k * z[n]); // pos & neg freq pair.  use T cos!
+      x += f[n] * 2 * std::cos(k * z[n]); // pos & neg freq pair.  use T cos!
     return x;
   }
 };
@@ -632,18 +634,14 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
   }
 
   // heuristic to choose default upsampfac... (currently two poss)
-  if (opts.upsampfac == 0.0) {              // indicates auto-choose
-    opts.upsampfac = 2.0;                   // default, and need for tol small
-    if (tol >= (TF)1E-9) {                  // the tol sigma=5/4 can reach
-      if (type == 3)                        // could move to setpts, more known?
-        opts.upsampfac = 1.25;              // faster b/c smaller RAM & FFT
-      else if ((dim == 1 && N() > 10000000) || (dim == 2 && N() > 300000) ||
-               (dim == 3 && N() > 3000000)) // type 1,2 heuristic cutoffs, double,
-                                            // typ tol, 12-core xeon
-        opts.upsampfac = 1.25;
-    }
+  if (opts.upsampfac == 0.0) {                                   // indicates auto-choose
+    const auto density = double(nj) / double(N() > 0 ? N() : 1); // dumbinputs allows
+                                                                 // N()==0
+    opts.upsampfac = bestUpsamplingFactor<TF>(opts.nthreads, density, dim, type, tol);
     if (opts.debug > 1)
-      printf("[%s] set auto upsampfac=%.2f\n", __func__, opts.upsampfac);
+      printf("[%s] threads %d, density %.3g, dim %d, nufft type %d, tol %.3g: auto "
+             "upsampfac=%.2f\n",
+             __func__, opts.nthreads, density, dim, type, tol, opts.upsampfac);
   }
   // use opts to choose and write into plan's spread options...
   ier = setup_spreader_for_nufft(spopts, tol, opts, dim);

From c6860a8d81bfd79bc63a7a2eb289e4e89ddfed7f Mon Sep 17 00:00:00 2001
From: Marco Barbone <DiamonDinoia@users.noreply.github.com>
Date: Thu, 27 Feb 2025 19:42:59 -0500
Subject: [PATCH 5/5] Update CMakeLists.txt

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d21cfc3f..a7213643a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -287,8 +287,8 @@ if(FINUFFT_USE_CUDA)
             "FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native'"
         )
         message(WARNING "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
+        set(CMAKE_CUDA_ARCHITECTURES "native")
     endif()
-    set(CMAKE_CUDA_ARCHITECTURES "native")
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     add_subdirectory(src/cuda)