Skip to content

Output driven parallelism #663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 79 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
3b91b4c
using adaptive bin size
DiamonDinoia Mar 28, 2025
e3826f2
using adaptive bin size but 1/6 for 1D
DiamonDinoia Mar 28, 2025
5fafb4f
using adaptive bin size but 1/6 for 1D
DiamonDinoia Mar 28, 2025
fba94ce
WIP
DiamonDinoia Apr 8, 2025
92b5909
WIP
DiamonDinoia Apr 10, 2025
0716f1f
new method in place
DiamonDinoia Apr 10, 2025
6b64e4e
WIP
DiamonDinoia Apr 11, 2025
e1b16f5
binsize revamp
DiamonDinoia Apr 15, 2025
9ddf241
dry run of method 3
DiamonDinoia Apr 15, 2025
930933e
first implementation
DiamonDinoia Apr 17, 2025
a24f7d5
first working implementation
DiamonDinoia Apr 18, 2025
db91a56
clean implementation in 3D
DiamonDinoia Apr 18, 2025
8f0060c
clean implementation in 3D
DiamonDinoia Apr 18, 2025
c0791ff
fixed 3D tests, WIP 1D/2D
DiamonDinoia Apr 18, 2025
31e0219
full spreading implemented
DiamonDinoia Apr 18, 2025
e769e9d
using dirft to compute full ffts
DiamonDinoia Apr 21, 2025
ae9896f
pretty prints fixed sync
DiamonDinoia Apr 21, 2025
947da6a
Merge branch 'cufinufft-improve-tests' into cufinufft-output-driven-p…
DiamonDinoia Apr 21, 2025
bf0e929
using replacements for span/mdspan
DiamonDinoia Apr 21, 2025
b11068d
using new span commit
DiamonDinoia Apr 22, 2025
2ae4e11
using ndrange
DiamonDinoia Apr 22, 2025
7e9630c
* using dirft to compute full ffts
DiamonDinoia Apr 21, 2025
5d6418b
Fixing tests.
DiamonDinoia Apr 28, 2025
0180303
Merge branch 'cufinufft-improve-tests' into cufinufft-output-driven-p…
DiamonDinoia Apr 28, 2025
9af6646
better intrinsics
DiamonDinoia Apr 29, 2025
3f5d485
trying using cccl for spans
DiamonDinoia Apr 29, 2025
459a8bb
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Apr 30, 2025
3838c6e
fixing atomicAdd (?)
DiamonDinoia Apr 30, 2025
87c0f39
better fix
DiamonDinoia Apr 30, 2025
9bb3f54
const in the wrong place
DiamonDinoia Apr 30, 2025
f450357
tuned
DiamonDinoia Apr 30, 2025
5628c97
small fix
DiamonDinoia Apr 30, 2025
3fba8a9
hopefilly atomic add works now
DiamonDinoia Apr 30, 2025
98fb114
update cccl
DiamonDinoia Apr 30, 2025
49bc410
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Apr 30, 2025
7c2fa63
using setup-cpp
DiamonDinoia May 1, 2025
4c841d1
updated binsize and method
DiamonDinoia May 1, 2025
743e87c
using windows 2022
DiamonDinoia May 1, 2025
daa1756
using windows 2022
DiamonDinoia May 1, 2025
bbf16ea
passing C++17 to msvc
DiamonDinoia May 1, 2025
878fd3b
do not download cccl if not needed
DiamonDinoia May 1, 2025
f694c5c
using c++20 on msvc in github
DiamonDinoia May 1, 2025
1c0a5f0
removing relaxed constexpr
DiamonDinoia May 1, 2025
cf1e4fb
removing unused includes
DiamonDinoia May 1, 2025
ee3b6fc
fixed cccl should always be included
DiamonDinoia May 1, 2025
7aca4a8
tweaks
DiamonDinoia May 2, 2025
b44845a
fixing cmake
DiamonDinoia May 2, 2025
70d6689
fixed tests
DiamonDinoia May 2, 2025
f7dac0b
fix-load library
DiamonDinoia May 2, 2025
1782b99
cmake cuda architectures fix
DiamonDinoia May 2, 2025
b693f55
using correct compute cap
DiamonDinoia May 2, 2025
b46406d
optimized interp
DiamonDinoia May 2, 2025
9b59c50
removing debug prints
DiamonDinoia May 2, 2025
c093b86
fixed cmake and removed hanging out code
DiamonDinoia May 2, 2025
1c48ad5
restore helper math
DiamonDinoia May 2, 2025
e2e76ad
restore math test
DiamonDinoia May 2, 2025
e3df9c1
Revert last two commits
DiamonDinoia May 2, 2025
63382cd
added documentation
DiamonDinoia May 5, 2025
aa7d056
added documentation
DiamonDinoia May 5, 2025
a48fda7
added documentation
DiamonDinoia May 5, 2025
c52fb06
small doc fixes
DiamonDinoia May 5, 2025
e171cdb
restore win-2019
DiamonDinoia May 5, 2025
c2c0a2a
restore win-2019 properly
DiamonDinoia May 5, 2025
7f98bf1
restore win-2022
DiamonDinoia May 5, 2025
ce3e2a0
Fixed multi GPU, removed code duplication
DiamonDinoia May 7, 2025
53b589d
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia May 7, 2025
6ae58e9
adapting cccl
DiamonDinoia May 9, 2025
869795b
intrinsics wrapper
DiamonDinoia May 9, 2025
5d750da
fixed clang build
DiamonDinoia May 9, 2025
d409480
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Jun 20, 2025
915ade3
Robert fixes
DiamonDinoia Jun 20, 2025
0315504
fixed cuda detection message
DiamonDinoia Jun 20, 2025
3854656
fixed Robert's comments
DiamonDinoia Jun 20, 2025
14481f6
fixed thread detection
DiamonDinoia Jun 20, 2025
705b8fe
documentation comments fix
DiamonDinoia Jun 20, 2025
d037a27
updated documentation
DiamonDinoia Jun 20, 2025
79a429f
rename vp_sm
DiamonDinoia Jun 20, 2025
130e100
rename vp_sm
DiamonDinoia Jun 20, 2025
f2806c1
Merge remote-tracking branch 'flatiron/master' into cufinufft-output-…
DiamonDinoia Jun 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions .github/workflows/build_cufinufft_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,22 @@ jobs:
matrix:
buildplat:
- [ ubuntu-22.04, manylinux_x86_64 ]
- [ windows-2019, win_amd64 ]
- [ windows-2022, win_amd64 ]
steps:
- uses: actions/checkout@v4
- uses: ilammy/msvc-dev-cmd@v1
- name: Setup Cpp
if: ${{ matrix.buildplat[0] == 'windows-2022' }}
uses: aminya/[email protected]
with:
compiler: msvc-2022
vcvarsall: true
cmake: true
ninja: true
vcpkg: false
cppcheck: false
- name: Setup CUDA
if: ${{ matrix.buildplat[0] == 'windows-2019' }}
uses: Jimver/[email protected].21
if: ${{ matrix.buildplat[0] == 'windows-2022' }}
uses: Jimver/[email protected].23
with:
cuda: '12.4.0'
- name: Build ${{ matrix.buildplat[1] }} wheels
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Master (working towards v2.5.0), 6/24/25
* Python version of 2D Poisson solve tutorial (Julius Herb, #700).
* Cached the optimal thread number (# physical cores) to reduce system call
overhead in repeated small transforms (YuWei-CH, #697, fixing #693).
* Added output-driven parallelism to cufinufft (Barbone)
* replaced LGPL-licensed Gauss-Legendre quadrature code by Apache2-licensed
code adapted from Jason Kaye's cppdlr. CPU and GPU. PR #692 (Barnett).

Expand Down
27 changes: 17 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.19...3.30)
cmake_minimum_required(VERSION 3.19...3.31)
project(FINUFFT VERSION 2.4.0 LANGUAGES C CXX)

# windows MSVC runtime flags policy
Expand Down Expand Up @@ -153,7 +153,13 @@ function(enable_asan target)
endif()
endfunction()

set(CPM_DOWNLOAD_VERSION 0.40.5)
set(CPM_DOWNLOAD_VERSION "0.40.5" CACHE STRING "Version of CPM.cmake to use")
set(FFTW_VERSION "3.3.10" CACHE STRING "Version of FFTW to use")
set(XTL_VERSION "0.7.7" CACHE STRING "Version of xtl to use")
set(XSIMD_VERSION "13.2.0" CACHE STRING "Version of xsimd to use")
set(DUCC0_VERSION "ducc0_0_36_0" CACHE STRING "Version of ducc0 to use")
set(FINUFFT_CUDA11_CCCL_VERSION "2.8.4" CACHE STRING "Version of FINUFFT-cccl for cuda 11 to use")
set(FINUFFT_CUDA12_CCCL_VERSION "3.0.0-rc7" CACHE STRING "Version of FINUFFT-cccl for cuda 12 to use")
include(cmake/setupCPM.cmake)

if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
Expand All @@ -171,10 +177,6 @@ if(FINUFFT_USE_CPU)
if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
add_link_options("-ld_classic")
endif()
set(FFTW_VERSION 3.3.10)
set(XTL_VERSION 0.7.7)
set(XSIMD_VERSION 13.2.0)
set(DUCC0_VERSION ducc0_0_36_0)
set(FINUFFT_FFTW_LIBRARIES)
include(cmake/setupXSIMD.cmake)
if(FINUFFT_USE_DUCC0)
Expand All @@ -191,7 +193,6 @@ if(FINUFFT_USE_CPU)
find_package(OpenMP COMPONENTS C CXX REQUIRED)
endif()
endif()

# check if -Wno-deprecated-declarations is supported
check_cxx_compiler_flag(-Wno-deprecated-declarations FINUFFT_HAS_NO_DEPRECATED_DECLARATIONS)

Expand Down Expand Up @@ -287,13 +288,19 @@ if(FINUFFT_USE_CUDA)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
message(
WARNING
"FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native'"
"FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to 'native' See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply."
)
detect_cuda_architecture()
endif()
if(CMAKE_CUDA_ARCHITECTURES MATCHES "compute_")
message(
FATAL_ERROR
"CMAKE_CUDA_ARCHITECTURES must be a list of integers like 70;75;86, not strings like compute_89"
)
message(WARNING "See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
set(CMAKE_CUDA_ARCHITECTURES "native")
endif()
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
include(cmake/setupCCCL.cmake)
add_subdirectory(src/cuda)
if(BUILD_TESTING AND FINUFFT_BUILD_TESTS)
add_subdirectory(perftest/cuda)
Expand Down
22 changes: 22 additions & 0 deletions cmake/setupCCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Fetch NVIDIA CCCL via CPM, choosing the release line that matches the
# detected CUDA major version: CUDA 11 requires the 2.x CCCL series, while
# CUDA 12+ can use the 3.x series. The exact versions are configured by the
# FINUFFT_CUDA11_CCCL_VERSION / FINUFFT_CUDA12_CCCL_VERSION cache variables.
string(REPLACE "." ";" CUDA_VERSION_LIST ${CMAKE_CUDA_COMPILER_VERSION})
list(GET CUDA_VERSION_LIST 0 CUDA_VERSION_MAJOR)
message(STATUS "CUDA ${CUDA_VERSION_MAJOR} detected")
if(CUDA_VERSION_MAJOR LESS 12)
set(FINUFFT_CCCL_VERSION ${FINUFFT_CUDA11_CCCL_VERSION})
else()
set(FINUFFT_CCCL_VERSION ${FINUFFT_CUDA12_CCCL_VERSION})
endif()
# Single fetch point: both branches above only differ in the git tag.
CPMAddPackage(
NAME
CCCL
GIT_REPOSITORY
https://github.com/NVIDIA/cccl.git
GIT_TAG
v${FINUFFT_CCCL_VERSION}
)
27 changes: 27 additions & 0 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,30 @@ else()
message(WARNING "LTO is not supported: ${LTO_ERROR}")
set(FINUFFT_INTERPROCEDURAL_OPTIMIZATION FALSE)
endif()

# Detect the compute capability of the local GPU by querying nvidia-smi, and
# set CMAKE_CUDA_ARCHITECTURES (in the caller's scope) to the matching integer
# architecture (e.g. "8.6" -> 86). Falls back to 70 when nvidia-smi is missing
# or its output cannot be parsed.
# NOTE(review): with multiple GPUs, nvidia-smi prints one compute_cap per line,
# so the regex match fails and the 70 fallback is taken — confirm if intended.
function(detect_cuda_architecture)
find_program(NVIDIA_SMI_EXECUTABLE nvidia-smi)

if(NVIDIA_SMI_EXECUTABLE)
execute_process(
COMMAND ${NVIDIA_SMI_EXECUTABLE} --query-gpu=compute_cap --format=csv,noheader
OUTPUT_VARIABLE compute_cap
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)

# Expect a single "major.minor" token, e.g. "8.6".
if(compute_cap MATCHES "^[0-9]+\\.[0-9]+$")
# "8.6" -> "86", the integer form CMake expects.
string(REPLACE "." "" arch "${compute_cap}")
message(STATUS "Detected CUDA compute capability: ${compute_cap} (sm_${arch})")

# Pass as list of integers, not string
set(CMAKE_CUDA_ARCHITECTURES ${arch} PARENT_SCOPE)
else()
message(WARNING "Failed to parse compute capability: '${compute_cap}', defaulting to 70")
set(CMAKE_CUDA_ARCHITECTURES 70 PARENT_SCOPE)
endif()
else()
message(WARNING "nvidia-smi not found, defaulting CMAKE_CUDA_ARCHITECTURES to 70")
set(CMAKE_CUDA_ARCHITECTURES 70 PARENT_SCOPE)
endif()
endfunction()
1 change: 1 addition & 0 deletions docs/ackn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ Testing, bug reports, helpful discussions, contributions:
* Reinhard Neder - fortran90 demo using finufft as module, OSX build
* Vineet Bansal - py packaging
* Jason Kaye - Gauss-Legendre quadrature code from cppdlr
* Juan Ignacio Polanco - cuFINUFFT output driven
* Julius Herb - Poisson equation tutorial in Python

Logo design: `Sherry Choi <http://www.sherrychoi.com>`_ with input
Expand Down
17 changes: 12 additions & 5 deletions docs/c_gpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,11 @@ Given the user's desired dimension, number of Fourier modes in each direction, s

Inputs:

type type of the transform, 1 or 2 (note: 3 is not implemented yet)
dim overall dimension of the transform, 2 or 3 (note: 1 is not implemented
yet)
type type of the transform, 1, 2 or 3
dim overall dimension of the transform, 1, 2 or 3
nmodes a length-dim integer array: nmodes[d] is the number of Fourier modes in
(zero-indexed) direction d. Specifically,
in 1D: nmodes[0]=N1,
in 2D: nmodes[0]=N1, nmodes[1]=N2,
in 3D: nmodes[0]=N1, nmodes[1]=N2, nmodes[2]=N3.
iflag if >=0, uses + sign in complex exponential, otherwise - sign
Expand Down Expand Up @@ -256,10 +256,14 @@ The result is written into whichever array was not the input (the roles of these
(size M*ntransf complex array).
If type 2, the output values at the nonuniform point targets
(size M*ntransf complex array).
If type 3, the input strengths at the nonuniform point sources
(size M*ntransf complex array).
f If type 1, the output Fourier mode coefficients (size N1*N2*ntransf
or N1*N2*N3*ntransf complex array, when dim = 2 or 3 respectively).
If type 2, the input Fourier mode coefficients (size N1*N2*ntransf
or N1*N2*N3*ntransf complex array, when dim = 2 or 3 respectively).
If type 3, the output Fourier mode coefficients (size N1*N2*ntransf
or N1*N2*N3*ntransf complex array, when dim = 2 or 3 respectively).

Returns:

Expand Down Expand Up @@ -330,7 +334,9 @@ Algorithm performance options

* ``gpu_method=2`` : for spreading only, ie, type 1 transforms, uses a shared memory output-block driven method, referred to as SM in our paper. Has no effect for interpolation (type 2 transforms).

* ``gpu_method>2`` : (various upsupported experimental methods due to Melody Shih, not for regular users. Eg ``3`` tests an idea of Paul Springer's to group NU points when spreading, ``4`` is a block gather method of possible interest.)
* ``gpu_method=3`` : for spreading only, ie, type 1 transforms, uses a shared memory output-block driven method with a different algorithm.

* ``gpu_method>3`` : (various unsupported experimental methods due to Melody Shih, not for regular users. E.g. ``4`` is a block gather method of possible interest.)

**gpu_sort**: ``0`` do not sort nonuniform points, ``1`` do sort nonuniform points. Only has an effect when ``gpu_method=1`` (or if this method has been internally chosen when ``gpu_method=0``). Unlike the CPU code, there is no auto-choice since in our experience sorting is fast and always helps. It is possible for structured NU point inputs that ``gpu_sort=0`` may be the faster.

Expand All @@ -344,8 +350,9 @@ Algorithm performance options

**gpu_maxbatchsize**: ``0`` use heuristically defined batch size for vectorized (many-transforms with same NU points) interface, else set this batch size.

**gpu_stream**: CUDA stream to use. Leave at default unless you know what you're doing. [To be documented]
**gpu_stream**: CUDA stream to use. Leave at default unless you know what you're doing.

**gpu_np**: Min batch size used for ``method 3`` (OD). It has to be a multiple of 16. It controls how much of the shared memory is left as GPU cache instead of being manually populated. Default is usually best.

For all GPU option default values we refer to the source code in
``src/cuda/cufinufft.cu:cufinufft_default_opts``):
Expand Down
105 changes: 105 additions & 0 deletions docs/impl_gpu.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
Implementation details
======================

This file contains detailed explanations of the algorithms and optimization strategies
used in the library.

The focus is on clarity and reproducibility of the core computational techniques,
including spreading/interpolation schemes, memory access patterns, and kernel launch
structures.

.. note::

This is a living document. Implementation details are subject to change as
performance and accuracy improvements are integrated.

Output Driven
-------------

The **output-driven spreading strategy** is designed to reduce global memory traffic and
exploit shared memory locality. A CUDA block corresponds to a spatial tile in the output
grid, and shared memory is used to accumulate updates from multiple nonuniform points.

The process follows three main stages:

1. **Per-thread kernel evaluation:**

Each thread computes the spreading kernel at a single NUFFT point.
Kernel values are stored into shared memory (`kerevals`) in a batched layout,
allowing reuse by all threads in the block.
`kerevals` is a 3D array with shape `(Np, dim, ns)`, where `Np` is the number of NUFFT points.
Using CUDA parallelism, all kernel values can be evaluated in parallel by accessing
`kerevals(thread.id, dim, 0)`. The third index is always 0 because `eval_kernel_vec`
takes a pointer and writes `ns` values in one go.
This corresponds to:

.. code-block:: cpp

eval_kernel_vec<T, ns>(&kerevals(i, 0, 0), x1, es_c, es_beta);

2. **Thread-cooperative accumulation in shared memory:**

- Instead of assigning 1 thread per point (which would lead to shared memory collisions),
all threads iterate over a small batch (`Np`) of NUFFT points.
That is, the points are not processed in parallel, but the inner loop (tensor product) is.

The **Shared Memory (SM) approach** does:

.. code-block:: none

parallel for point = 0 to NumPoints
...
for x = 0 to ns
for y = 0 to ns
for z = 0 to ns
...

The **Output-driven approach** does:

For each point:

- Loop over NUFFT points sequentially.
- Parallelize over kernel grid entries using a flattened loop up to :math:`n_s^{\text{dim}}`.

Example pseudocode:

.. code-block:: none

for point = 0 to NumPoints, point+=np
...
parallel for i = 0 to pow(ns, dim)
...
...

The parallelism is flipped: SM parallelizes the outer loop (over points), while
Output-driven parallelizes the inner loop (over the kernel values).
There is no collision because `local_subgrid` is accessed by `(ix, iy, iz)` — and these
are unique per thread as determined by the thread ID.
This removes the need for `AtomicAdd` on the local subgrid.

3. **Atomic addition to global memory:**

Unchanged from SM: once all points have been processed and accumulated into `local_subgrid`,
the block performs an atomic write to global memory (`fw`). Since this step is
amortized over many points, its overhead is negligible.

Memory Organization
~~~~~~~~~~~~~~~~~~~

- `kerevals`:
Stores kernel weights in shape `(Np, dim, ns)`. Threads access only their assigned batch rows.

- `local_subgrid`:
A padded shared-memory grid with shape :math:`(bin\_size + padding)^{dim}`.
where the padding is :math:`padding = 2((ns+1)/2)`.
Threads write to disjoint sections during accumulation to avoid races.

Design Insights
~~~~~~~~~~~~~~~

This hybrid parallelization combines **per-point parallelism** (step 1) and **spatial parallelism**
(step 2):

- Threads collaborate rather than compete on shared memory access.
- Batching (`Np`) controls memory footprint and allows tuning for hardware constraints.
- Synchronization barriers ensure correctness before accessing shared buffers.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Documentation contents
changelog
nfft_migr
cufinufft_migration
impl_gpu
devnotes
related
users
Expand Down
13 changes: 6 additions & 7 deletions include/cufinufft/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void onedim_nuft_kernel_precomp(T *f, T *zout, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z);
int bin_size_z, int np);

template<typename T>
void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);
Expand All @@ -52,13 +52,12 @@ int cufinufft_set_shared_memory(V *kernel, const int dim,
/**
* WARNING: this function does not handle cuda errors. The caller should check them.
*/
int device_id{}, shared_mem_per_block{};
cudaGetDevice(&device_id);
const auto shared_mem_required =
shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
int shared_mem_per_block{};
const auto shared_mem_required = shared_memory_required<T>(
dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex, d_plan.opts.gpu_binsizey,
d_plan.opts.gpu_binsizez, d_plan.opts.gpu_np);
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
device_id);
d_plan.opts.gpu_device_id);
if (shared_mem_required > shared_mem_per_block) {
fprintf(stderr,
"Error: Shared memory required per block is %zu bytes, but the device "
Expand Down
24 changes: 24 additions & 0 deletions include/cufinufft/contrib/helper_math.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,28 @@ __host__ __device__ __forceinline__ cuFloatComplex operator/(
return make_cuFloatComplex((a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom);
}

// Compound-assignment helpers for CUDA complex types, so kernels can write
// `acc += val` instead of spelling out cuCadd/cuCaddf. The real-scalar
// overloads add to the real component only (imaginary part unchanged).

// complex += complex (double precision), via cuCadd.
__host__ __device__ __forceinline__ cuDoubleComplex &operator+=(
    cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
  a = cuCadd(a, b);
  return a;
}

// complex += real scalar (double precision): only the real part is updated.
__host__ __device__ __forceinline__ cuDoubleComplex &operator+=(cuDoubleComplex &a,
                                                                double b) noexcept {
  a.x += b;
  return a;
}

// complex += complex (single precision), via cuCaddf.
__host__ __device__ __forceinline__ cuFloatComplex &operator+=(
    cuFloatComplex &a, const cuFloatComplex &b) noexcept {
  a = cuCaddf(a, b);
  return a;
}

// complex += real scalar (single precision): only the real part is updated.
__host__ __device__ __forceinline__ cuFloatComplex &operator+=(cuFloatComplex &a,
                                                               float b) noexcept {
  a.x += b;
  return a;
}

#endif // FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
Loading
Loading