Skip to content

Output driven parallelism #663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 79 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
3b91b4c
using adaptive bin size
DiamonDinoia Mar 28, 2025
e3826f2
using adaptive bin size but 1/6 for 1D
DiamonDinoia Mar 28, 2025
5fafb4f
using adaptive bin size but 1/6 for 1D
DiamonDinoia Mar 28, 2025
fba94ce
WIP
DiamonDinoia Apr 8, 2025
92b5909
WIP
DiamonDinoia Apr 10, 2025
0716f1f
new method in place
DiamonDinoia Apr 10, 2025
6b64e4e
WIP
DiamonDinoia Apr 11, 2025
e1b16f5
binsize revamp
DiamonDinoia Apr 15, 2025
9ddf241
dry run of method 3
DiamonDinoia Apr 15, 2025
930933e
first implementation
DiamonDinoia Apr 17, 2025
a24f7d5
first working implementation
DiamonDinoia Apr 18, 2025
db91a56
clean implementation in 3D
DiamonDinoia Apr 18, 2025
8f0060c
clean implementation in 3D
DiamonDinoia Apr 18, 2025
c0791ff
fixed 3D tests, WIP 1D/2D
DiamonDinoia Apr 18, 2025
31e0219
full spreading implemented
DiamonDinoia Apr 18, 2025
e769e9d
using dirft to compute full ffts
DiamonDinoia Apr 21, 2025
ae9896f
pretty prints fixed sync
DiamonDinoia Apr 21, 2025
947da6a
Merge branch 'cufinufft-improve-tests' into cufinufft-output-driven-p…
DiamonDinoia Apr 21, 2025
bf0e929
using replacements for span/mdspan
DiamonDinoia Apr 21, 2025
b11068d
using new span commit
DiamonDinoia Apr 22, 2025
2ae4e11
using ndrange
DiamonDinoia Apr 22, 2025
7e9630c
* using dirft to compute full ffts
DiamonDinoia Apr 21, 2025
5d6418b
Fixing tests.
DiamonDinoia Apr 28, 2025
0180303
Merge branch 'cufinufft-improve-tests' into cufinufft-output-driven-p…
DiamonDinoia Apr 28, 2025
9af6646
better intrinsics
DiamonDinoia Apr 29, 2025
3f5d485
trying using cccl for spans
DiamonDinoia Apr 29, 2025
459a8bb
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Apr 30, 2025
3838c6e
fixing atomicAdd (?)
DiamonDinoia Apr 30, 2025
87c0f39
better fix
DiamonDinoia Apr 30, 2025
9bb3f54
const in the wrong place
DiamonDinoia Apr 30, 2025
f450357
tuned
DiamonDinoia Apr 30, 2025
5628c97
small fix
DiamonDinoia Apr 30, 2025
3fba8a9
hopefilly atomic add works now
DiamonDinoia Apr 30, 2025
98fb114
update cccl
DiamonDinoia Apr 30, 2025
49bc410
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Apr 30, 2025
7c2fa63
using setup-cpp
DiamonDinoia May 1, 2025
4c841d1
updated binsize and method
DiamonDinoia May 1, 2025
743e87c
using windows 2022
DiamonDinoia May 1, 2025
daa1756
using windows 2022
DiamonDinoia May 1, 2025
bbf16ea
passing C++17 to msvc
DiamonDinoia May 1, 2025
878fd3b
do not download cccl if not needed
DiamonDinoia May 1, 2025
f694c5c
using c++20 on msvc in github
DiamonDinoia May 1, 2025
1c0a5f0
removing relaxed constexpr
DiamonDinoia May 1, 2025
cf1e4fb
removing unused includes
DiamonDinoia May 1, 2025
ee3b6fc
fixed cccl should always be included
DiamonDinoia May 1, 2025
7aca4a8
tweaks
DiamonDinoia May 2, 2025
b44845a
fixing cmake
DiamonDinoia May 2, 2025
70d6689
fixed tests
DiamonDinoia May 2, 2025
f7dac0b
fix-load library
DiamonDinoia May 2, 2025
1782b99
cmake cuda architectures fix
DiamonDinoia May 2, 2025
b693f55
using correct compute cap
DiamonDinoia May 2, 2025
b46406d
optimized interp
DiamonDinoia May 2, 2025
9b59c50
removing debug prints
DiamonDinoia May 2, 2025
c093b86
fixed cmake and removed hanging out code
DiamonDinoia May 2, 2025
1c48ad5
restore helper math
DiamonDinoia May 2, 2025
e2e76ad
restore math test
DiamonDinoia May 2, 2025
e3df9c1
Revert last two commits
DiamonDinoia May 2, 2025
63382cd
added documentation
DiamonDinoia May 5, 2025
aa7d056
added documentation
DiamonDinoia May 5, 2025
a48fda7
added documentation
DiamonDinoia May 5, 2025
c52fb06
small doc fixes
DiamonDinoia May 5, 2025
e171cdb
restore win-2019
DiamonDinoia May 5, 2025
c2c0a2a
restore win-2019 properly
DiamonDinoia May 5, 2025
7f98bf1
restore win-2022
DiamonDinoia May 5, 2025
ce3e2a0
Fixed multi GPU, removed code duplication
DiamonDinoia May 7, 2025
53b589d
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia May 7, 2025
6ae58e9
adapting cccl
DiamonDinoia May 9, 2025
869795b
intrinsics wrapper
DiamonDinoia May 9, 2025
5d750da
fixed clang build
DiamonDinoia May 9, 2025
d409480
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Jun 20, 2025
915ade3
Robert fixes
DiamonDinoia Jun 20, 2025
0315504
fixed cuda detection message
DiamonDinoia Jun 20, 2025
3854656
fixed Robert's comments
DiamonDinoia Jun 20, 2025
14481f6
fixed thread detection
DiamonDinoia Jun 20, 2025
705b8fe
documentation comments fix
DiamonDinoia Jun 20, 2025
d037a27
updated documentation
DiamonDinoia Jun 20, 2025
79a429f
rename vp_sm
DiamonDinoia Jun 20, 2025
130e100
rename vp_sm
DiamonDinoia Jun 20, 2025
f2806c1
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Jun 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions .github/workflows/build_cufinufft_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,22 @@ jobs:
matrix:
buildplat:
- [ ubuntu-22.04, manylinux_x86_64 ]
- [ windows-2019, win_amd64 ]
- [ windows-2022, win_amd64 ]
steps:
- uses: actions/checkout@v4
- uses: ilammy/msvc-dev-cmd@v1
- name: Setup Cpp
if: ${{ matrix.buildplat[0] == 'windows-2022' }}
uses: aminya/[email protected]
with:
compiler: msvc-2022
vcvarsall: true
cmake: true
ninja: true
vcpkg: false
cppcheck: false
- name: Setup CUDA
if: ${{ matrix.buildplat[0] == 'windows-2019' }}
uses: Jimver/[email protected].21
if: ${{ matrix.buildplat[0] == 'windows-2022' }}
uses: Jimver/[email protected].23
with:
cuda: '12.4.0'
- name: Build ${{ matrix.buildplat[1] }} wheels
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Master (working towards v2.5.0), 6/24/25
* Python version of 2D Poisson solve tutorial (Julius Herb, #700).
* Cached the optimal thread number (# physical cores) to reduce system call
overhead in repeated small transforms (YuWei-CH, #697, fixing #693).
* Added output-driven parallelism to cufinufft (Barbone)
* replaced LGPL-licensed Gauss-Legendre quadrature code by Apache2-licensed
code adapted from Jason Kaye's cppdlr. CPU and GPU. PR #692 (Barnett).

Expand Down
27 changes: 17 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.19...3.30)
cmake_minimum_required(VERSION 3.19...3.31)
project(FINUFFT VERSION 2.4.0 LANGUAGES C CXX)

# windows MSVC runtime flags policy
Expand Down Expand Up @@ -153,7 +153,13 @@ function(enable_asan target)
endif()
endfunction()

set(CPM_DOWNLOAD_VERSION 0.40.5)
set(CPM_DOWNLOAD_VERSION "0.40.5" CACHE STRING "Version of CPM.cmake to use")
set(FFTW_VERSION "3.3.10" CACHE STRING "Version of FFTW to use")
set(XTL_VERSION "0.7.7" CACHE STRING "Version of xtl to use")
set(XSIMD_VERSION "13.2.0" CACHE STRING "Version of xsimd to use")
set(DUCC0_VERSION "ducc0_0_36_0" CACHE STRING "Version of ducc0 to use")
set(FINUFFT_CUDA11_CCCL_VERSION "2.8.4" CACHE STRING "Version of FINUFFT-cccl for cuda 11 to use")
set(FINUFFT_CUDA12_CCCL_VERSION "3.0.0-rc7" CACHE STRING "Version of FINUFFT-cccl for cuda 12 to use")
include(cmake/setupCPM.cmake)

if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
Expand All @@ -171,10 +177,6 @@ if(FINUFFT_USE_CPU)
if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
add_link_options("-ld_classic")
endif()
set(FFTW_VERSION 3.3.10)
set(XTL_VERSION 0.7.7)
set(XSIMD_VERSION 13.2.0)
set(DUCC0_VERSION ducc0_0_36_0)
set(FINUFFT_FFTW_LIBRARIES)
include(cmake/setupXSIMD.cmake)
if(FINUFFT_USE_DUCC0)
Expand All @@ -191,7 +193,6 @@ if(FINUFFT_USE_CPU)
find_package(OpenMP COMPONENTS C CXX REQUIRED)
endif()
endif()

# check if -Wno-deprecated-declarations is supported
check_cxx_compiler_flag(-Wno-deprecated-declarations FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)

Expand Down Expand Up @@ -287,13 +288,19 @@ if(FINUFFT_USE_CUDA)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
message(
WARNING
"FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native'"
"FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native' See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply."
)
detect_cuda_architecture()
endif()
if(CMAKE_CUDA_ARCHITECTURES MATCHES "compute_")
message(
FATAL_ERROR
"CMAKE_CUDA_ARCHITECTURES must be a list of integers like 70;75;86, not strings like compute_89"
)
message(WARNING "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
set(CMAKE_CUDA_ARCHITECTURES "native")
endif()
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
include(cmake/setupCCCL.cmake)
add_subdirectory(src/cuda)
if(BUILD_TESTING AND FINUFFT_BUILD_TESTS)
add_subdirectory(perftest/cuda)
Expand Down
22 changes: 22 additions & 0 deletions cmake/setupCCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Fetch NVIDIA CCCL via CPM, choosing the release line that matches the
# detected CUDA major version: CUDA 11 requires the 2.x CCCL series, while
# CUDA 12+ can use the 3.x series. The exact versions are configured by the
# FINUFFT_CUDA11_CCCL_VERSION / FINUFFT_CUDA12_CCCL_VERSION cache variables.
string(REPLACE "." ";" CUDA_VERSION_LIST ${CMAKE_CUDA_COMPILER_VERSION})
list(GET CUDA_VERSION_LIST 0 CUDA_VERSION_MAJOR)
message(STATUS "CUDA ${CUDA_VERSION_MAJOR} detected")
if(CUDA_VERSION_MAJOR LESS 12)
set(FINUFFT_CCCL_VERSION ${FINUFFT_CUDA11_CCCL_VERSION})
else()
set(FINUFFT_CCCL_VERSION ${FINUFFT_CUDA12_CCCL_VERSION})
endif()
# Single fetch point: both branches above only differ in the git tag.
CPMAddPackage(
NAME
CCCL
GIT_REPOSITORY
https://github.com/NVIDIA/cccl.git
GIT_TAG
v${FINUFFT_CCCL_VERSION}
)
27 changes: 27 additions & 0 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,30 @@ else()
message(WARNING "LTO is not supported: ${LTO_ERROR}")
set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION FALSE)
endif()

# Detect the compute capability of the local GPU by querying nvidia-smi, and
# set CMAKE_CUDA_ARCHITECTURES (in the caller's scope) to the matching integer
# architecture (e.g. "8.6" -> 86). Falls back to 70 when nvidia-smi is missing
# or its output cannot be parsed.
# NOTE(review): with multiple GPUs, nvidia-smi prints one compute_cap per line,
# so the regex match fails and the 70 fallback is taken — confirm if intended.
function(detect_cuda_architecture)
find_program(NVIDIA_SMI_EXECUTABLE nvidia-smi)

if(NVIDIA_SMI_EXECUTABLE)
execute_process(
COMMAND ${NVIDIA_SMI_EXECUTABLE} --query-gpu=compute_cap --format=csv,noheader
OUTPUT_VARIABLE compute_cap
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)

# Expect a single "major.minor" token, e.g. "8.6".
if(compute_cap MATCHES "^[0-9]+\\.[0-9]+$")
# "8.6" -> "86", the integer form CMake expects.
string(REPLACE "." "" arch "${compute_cap}")
message(STATUS "Detected CUDA compute capability: ${compute_cap} (sm_${arch})")

# Pass as list of integers, not string
set(CMAKE_CUDA_ARCHITECTURES ${arch} PARENT_SCOPE)
else()
message(WARNING "Failed to parse compute capability: '${compute_cap}', defaulting to 70")
set(CMAKE_CUDA_ARCHITECTURES 70 PARENT_SCOPE)
endif()
else()
message(WARNING "nvidia-smi not found, defaulting CMAKE_CUDA_ARCHITECTURES to 70")
set(CMAKE_CUDA_ARCHITECTURES 70 PARENT_SCOPE)
endif()
endfunction()
1 change: 1 addition & 0 deletions docs/ackn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ Testing, bug reports, helpful discussions, contributions:
* Reinhard Neder - fortran90 demo using finufft as module, OSX build
* Vineet Bansal - py packaging
* Jason Kaye - Gauss-Legendre quadrature code from cppdlr
* Juan Ignacio Polanco - cuFINUFFT output driven
* Julius Herb - Poisson equation tutorial in Python

Logo design: `Sherry Choi <http://www.sherrychoi.com>`_ with input
Expand Down
17 changes: 12 additions & 5 deletions docs/c_gpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,11 @@ Given the user's desired dimension, number of Fourier modes in each direction, s

Inputs:

type type of the transform, 1 or 2 (note: 3 is not implemented yet)
dim overall dimension of the transform, 2 or 3 (note: 1 is not implemented
yet)
type type of the transform, 1, 2 or 3
dim overall dimension of the transform, 1, 2 or 3
nmodes a length-dim integer array: nmodes[d] is the number of Fourier modes in
(zero-indexed) direction d. Specifically,
in 1D: nmodes[0]=N1,
in 2D: nmodes[0]=N1, nmodes[1]=N2,
in 3D: nmodes[0]=N1, nmodes[1]=N2, nmodes[2]=N3.
iflag if >=0, uses + sign in complex exponential, otherwise - sign
Expand Down Expand Up @@ -256,10 +256,14 @@ The result is written into whichever array was not the input (the roles of these
(size M*ntransf complex array).
If type 2, the output values at the nonuniform point targets
(size M*ntransf complex array).
If type 3, the input strengths at the nonuniform point sources
(size M*ntransf complex array).
f If type 1, the output Fourier mode coefficients (size N1*N2*ntransf
or N1*N2*N3*ntransf complex array, when dim = 2 or 3 respectively).
If type 2, the input Fourier mode coefficients (size N1*N2*ntransf
or N1*N2*N3*ntransf complex array, when dim = 2 or 3 respectively).
If type 3, the output Fourier mode coefficients (size N1*N2*ntransf
or N1*N2*N3*ntransf complex array, when dim = 2 or 3 respectively).

Returns:

Expand Down Expand Up @@ -330,7 +334,9 @@ Algorithm performance options

* ``gpu_method=2`` : for spreading only, ie, type 1 transforms, uses a shared memory output-block driven method, referred to as SM in our paper. Has no effect for interpolation (type 2 transforms).

* ``gpu_method>2`` : (various upsupported experimental methods due to Melody Shih, not for regular users. Eg ``3`` tests an idea of Paul Springer's to group NU points when spreading, ``4`` is a block gather method of possible interest.)
* ``gpu_method=3`` : for spreading only, ie, type 1 transforms, uses a shared memory output-block driven method with a different algorithm.

* ``gpu_method>3`` : (various unsupported experimental methods due to Melody Shih, not for regular users. E.g. ``4`` is a block gather method of possible interest.)

**gpu_sort**: ``0`` do not sort nonuniform points, ``1`` do sort nonuniform points. Only has an effect when ``gpu_method=1`` (or if this method has been internally chosen when ``gpu_method=0``). Unlike the CPU code, there is no auto-choice since in our experience sorting is fast and always helps. It is possible for structured NU point inputs that ``gpu_sort=0`` may be the faster.

Expand All @@ -344,8 +350,9 @@ Algorithm performance options

**gpu_maxbatchsize**: ``0`` use heuristically defined batch size for vectorized (many-transforms with same NU points) interface, else set this batch size.

**gpu_stream**: CUDA stream to use. Leave at default unless you know what you're doing. [To be documented]
**gpu_stream**: CUDA stream to use. Leave at default unless you know what you're doing.

**gpu_np**: Min batch size used for ``method 3`` (OD). It has to be a multiple of 16. It controls how much of the shared memory is left as GPU cache instead of being manually populated. Default is usually best.

For all GPU option default values we refer to the source code in
``src/cuda/cufinufft.cu:cufinufft_default_opts``):
Expand Down
105 changes: 105 additions & 0 deletions docs/impl_gpu.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
Implementation details
======================

This file contains detailed explanations of the algorithms and optimization strategies
used in the library.

The focus is on clarity and reproducibility of the core computational techniques,
including spreading/interpolation schemes, memory access patterns, and kernel launch
structures.

.. note::

This is a living document. Implementation details are subject to change as
performance and accuracy improvements are integrated.

Output Driven
-------------

The **output-driven spreading strategy** is designed to reduce global memory traffic and
exploit shared memory locality. A CUDA block corresponds to a spatial tile in the output
grid, and shared memory is used to accumulate updates from multiple nonuniform points.

The process follows three main stages:

1. **Per-thread kernel evaluation:**

Each thread computes the spreading kernel at a single NUFFT point.
Kernel values are stored into shared memory (`kerevals`) in a batched layout,
allowing reuse by all threads in the block.
`kerevals` is a 3D array with shape `(Np, dim, ns)`, where `Np` is the number of NUFFT points.
Using CUDA parallelism, all kernel values can be evaluated in parallel by accessing
`kerevals(thread.id, dim, 0)`. The third index is always 0 because `eval_kernel_vec`
takes a pointer and writes `ns` values in one go.
This corresponds to:

.. code-block:: cpp

eval_kernel_vec<T, ns>(&kerevals(i, 0, 0), x1, es_c, es_beta);

2. **Thread-cooperative accumulation in shared memory:**

- Instead of assigning 1 thread per point (which would lead to shared memory collisions),
all threads iterate over a small batch (`Np`) of NUFFT points.
That is, the points are not processed in parallel, but the inner loop (tensor product) is.

The **Shared Memory (SM) approach** does:

.. code-block:: none

parallel for point = 0 to NumPoints
...
for x = 0 to ns
for y = 0 to ns
for z = 0 to ns
...

The **Output-driven approach** does:

For each point:

- Loop over NUFFT points sequentially.
- Parallelize over kernel grid entries using a flattened loop up to :math:`n_s^{\text{dim}}`.

Example pseudocode:

.. code-block:: none

for point = 0 to NumPoints, point+=np
...
parallel for i = 0 to pow(ns, dim)
...
...

The parallelism is flipped: SM parallelizes the outer loop (over points), while
Output-driven parallelizes the inner loop (over the kernel values).
There is no collision because `local_subgrid` is accessed by `(ix, iy, iz)` — and these
are unique per thread as determined by the thread ID.
This removes the need for `AtomicAdd` on the local subgrid.

3. **Atomic addition to global memory:**

Unchanged from SM: once all points have been processed and accumulated into `local_subgrid`,
the block performs an atomic write to global memory (`fw`). Since this step is
amortized over many points, its overhead is negligible.

Memory Organization
~~~~~~~~~~~~~~~~~~~

- `kerevals`:
Stores kernel weights in shape `(Np, dim, ns)`. Threads access only their assigned batch rows.

- `local_subgrid`:
A padded shared-memory grid with shape :math:`(bin\_size + padding)^{dim}`.
where the padding is :math:`padding = 2((ns+1)/2)`.
Threads write to disjoint sections during accumulation to avoid races.

Design Insights
~~~~~~~~~~~~~~~

This hybrid parallelization combines **per-point parallelism** (step 1) and **spatial parallelism**
(step 2):

- Threads collaborate rather than compete on shared memory access.
- Batching (`Np`) controls memory footprint and allows tuning for hardware constraints.
- Synchronization barriers ensure correctness before accessing shared buffers.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Documentation contents
changelog
nfft_migr
cufinufft_migration
impl_gpu
devnotes
related
users
Expand Down
13 changes: 6 additions & 7 deletions include/cufinufft/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void onedim_nuft_kernel_precomp(T *f, T *zout, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z);
int bin_size_z, int np);

template<typename T>
void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);
Expand All @@ -52,13 +52,12 @@ int cufinufft_set_shared_memory(V *kernel, const int dim,
/**
* WARNING: this function does not handle cuda errors. The caller should check them.
*/
int device_id{}, shared_mem_per_block{};
cudaGetDevice(&device_id);
const auto shared_mem_required =
shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
int shared_mem_per_block{};
const auto shared_mem_required = shared_memory_required<T>(
dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex, d_plan.opts.gpu_binsizey,
d_plan.opts.gpu_binsizez, d_plan.opts.gpu_np);
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
device_id);
d_plan.opts.gpu_device_id);
if (shared_mem_required > shared_mem_per_block) {
fprintf(stderr,
"Error: Shared memory required per block is %zu bytes, but the device "
Expand Down
24 changes: 24 additions & 0 deletions include/cufinufft/contrib/helper_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,28 @@ __host__ __device__ __forceinline__ cuFloatComplex operator/(
return make_cuFloatComplex((a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom);
}

// Compound-assignment helpers for CUDA complex types, so kernels can write
// `acc += val` instead of spelling out cuCadd/cuCaddf. The real-scalar
// overloads add to the real component only (imaginary part unchanged).

// complex += complex (double precision), via cuCadd.
__host__ __device__ __forceinline__ cuDoubleComplex &operator+=(
    cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
  a = cuCadd(a, b);
  return a;
}

// complex += real scalar (double precision): only the real part is updated.
__host__ __device__ __forceinline__ cuDoubleComplex &operator+=(cuDoubleComplex &a,
                                                                double b) noexcept {
  a.x += b;
  return a;
}

// complex += complex (single precision), via cuCaddf.
__host__ __device__ __forceinline__ cuFloatComplex &operator+=(
    cuFloatComplex &a, const cuFloatComplex &b) noexcept {
  a = cuCaddf(a, b);
  return a;
}

// complex += real scalar (single precision): only the real part is updated.
__host__ __device__ __forceinline__ cuFloatComplex &operator+=(cuFloatComplex &a,
                                                               float b) noexcept {
  a.x += b;
  return a;
}

#endif // FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
Loading
Loading