Skip to content

algo: implement sine/cosine based on Intel SVML for single/double precision #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a3751c4
algo: implement sine/cosine using Intel SVML for single/double precision
seiko2plus Jun 4, 2025
e23f065
BUG: Fix Precise options and doc comment for it
seiko2plus Jun 4, 2025
885b3ba
Add code generation tool for SIMD routines
seiko2plus Jun 10, 2025
973f3a2
Add spin build system configuration
seiko2plus Jun 10, 2025
2bf9b03
Add SIMD intrinsics testing framework
seiko2plus Jun 10, 2025
390fcb8
replace sin/cos lut with the new generators
seiko2plus Jun 10, 2025
9743c73
Add Highway library as git submodule
seiko2plus Jun 10, 2025
93c7e10
Add npsr meson build placeholder
seiko2plus Jun 10, 2025
4c41d23
Add comprehensive stress tests for sin/cos functions
seiko2plus Jun 10, 2025
4188588
Remove Sin/Cos bindings from precision template
seiko2plus Jun 10, 2025
2edd3d7
Bind Sin/Cos intrinsics to Python interface
seiko2plus Jun 10, 2025
b659308
Fix polynomial evaluation for large arguments in double precision
seiko2plus Jun 10, 2025
2de0607
Fix small argument handling and add cosine support for double precision
seiko2plus Jun 10, 2025
66bdf4d
Refactor: Extract Precise class to separate header
seiko2plus Jun 10, 2025
e47f828
Refactor: Move Precise parameter to first position in function signat…
seiko2plus Jun 10, 2025
1a6c7ee
Install sin/cos test cases in build system
seiko2plus Jun 10, 2025
88e58e9
Add test requirements file
seiko2plus Jun 10, 2025
a84cb48
remove unit tests (will be included in a separate PR)
seiko2plus Jul 28, 2025
ecd1f2f
refactor: reimplement Sollya generator using only Sollya scripts
seiko2plus Jul 28, 2025
300b6bc
remove auto-generated data (will be added in a separate PR)
seiko2plus Jul 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Editor temporary/working/backup files #
#########################################
.#*
[#]*#
*~
*$
*.bak
*.diff
.idea/
*.iml
*.ipr
*.iws
*.org
.project
pmip
*.rej
.settings/
.*.sw[nop]
.sw[nop]
*.tmp
*.vim
.vscode
tags
cscope.out
# gnu global
GPATH
GRTAGS
GSYMS
GTAGS
.cache
.mypy_cache/

# Compiled source #
###################
*.a
*.com
*.class
*.dll
*.exe
*.o
*.o.d
*.py[ocd]
*.so
*.mod

# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.bz2
*.bzip2
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.tbz2
*.tgz
*.zip

# Python files #
################
# meson build/installation directories
build
build-install
# meson python output
.mesonpy-native-file.ini
# sphinx build directory
_build
# dist directory is where sdist/wheel end up
dist
doc/build
doc/docenv
doc/cdoc/build
# Egg metadata
*.egg-info
# The shelf plugin uses this dir
./.shelf
.cache
pip-wheel-metadata
.python-version
# virtual envs
numpy-dev/
venv/

# Paver generated files #
#########################
/release

# Logs and databases #
######################
*.log
*.sql
*.sqlite

# Patches #
###########
*.patch
*.diff

# Do not ignore the following patches: #
########################################
!tools/ci/emscripten/0001-do-not-set-meson-environment-variable-pyodide-gh-4502.patch

# OS generated files #
######################
.DS_Store*
.VolumeIcon.icns
.fseventsd
Icon?
.gdb_history
ehthumbs.db
Thumbs.db
.directory

# pytest generated files #
##########################
/.pytest_cache
18 changes: 18 additions & 0 deletions .spin/cmds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os
import pathlib
import sys
import click
import spin
from spin.cmds import meson

curdir = pathlib.Path(__file__).parent
rootdir = curdir.parent


@click.command(help="Generate sollya python based files")
@click.option("-f", "--force", is_flag=True, help="Force regenerate all files")
def generate(*, force):
spin.util.run(
["python", str(rootdir / "tools" / "sollya" / "generate.py")]
+ (["--force"] if force else []),
)
37 changes: 37 additions & 0 deletions npsr/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef NUMPY_SIMD_ROUTINES_NPSR_COMMON_H_
#define NUMPY_SIMD_ROUTINES_NPSR_COMMON_H_

#include <hwy/highway.h>

Check failure on line 4 in npsr/common.h

View workflow job for this annotation

GitHub Actions / cpp-linter

npsr/common.h:4:10 [clang-diagnostic-error]

'hwy/highway.h' file not found

#include <cfenv>
#include <type_traits>

#include "precise.h"

#endif // NUMPY_SIMD_ROUTINES_NPSR_COMMON_H_

#if defined(NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_) == \
defined(HWY_TARGET_TOGGLE) // NOLINT
#ifdef NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
#undef NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
#else
#define NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
#endif

HWY_BEFORE_NAMESPACE();
namespace npsr::HWY_NAMESPACE {

Check warning on line 22 in npsr/common.h

View workflow job for this annotation

GitHub Actions / cpp-linter

npsr/common.h:22:11 [cppcoreguidelines-avoid-non-const-global-variables]

variable 'npsr' is non-const and globally accessible, consider making it const
namespace hn = hwy::HWY_NAMESPACE;
using hn::DFromV;
using hn::MFromD;
using hn::Rebind;
using hn::RebindToUnsigned;
using hn::TFromD;
using hn::TFromV;
using hn::VFromD;
constexpr bool kNativeFMA = HWY_NATIVE_FMA != 0;

HWY_ATTR void DummyToSuppressUnusedWarning() {}
} // namespace npsr::HWY_NAMESPACE
HWY_AFTER_NAMESPACE();

#endif // NUMPY_SIMD_ROUTINES_NPSR_COMMON_FOREACH_H_
12 changes: 12 additions & 0 deletions npsr/npsr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// To include them once per target, which is ensured by the toggle check.
// clang-format off
#if defined(_NPSR_NPSR_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
#ifdef _NPSR_NPSR_H_
#undef _NPSR_NPSR_H_
#else
#define _NPSR_NPSR_H_

Check warning on line 7 in npsr/npsr.h

View workflow job for this annotation

GitHub Actions / cpp-linter

npsr/npsr.h:7:9 [bugprone-reserved-identifier]

declaration uses identifier '_NPSR_NPSR_H_', which is a reserved identifier
#endif

#include "npsr/trig/inl.h"

Check failure on line 10 in npsr/npsr.h

View workflow job for this annotation

GitHub Actions / cpp-linter

npsr/npsr.h:10:10 [clang-diagnostic-error]

'npsr/trig/inl.h' file not found

#endif // _NPSR_NPSR_H_
151 changes: 151 additions & 0 deletions npsr/precise.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#ifndef NUMPY_SIMD_ROUTINES_NPSR_PRECISE_H_
#define NUMPY_SIMD_ROUTINES_NPSR_PRECISE_H_
#include <cfenv>

Check failure on line 3 in npsr/precise.h

View workflow job for this annotation

GitHub Actions / cpp-linter

npsr/precise.h:3:10 [clang-diagnostic-error]

'cfenv' file not found
#include <type_traits>

namespace npsr {

Check warning on line 6 in npsr/precise.h

View workflow job for this annotation

GitHub Actions / cpp-linter

npsr/precise.h:6:11 [cppcoreguidelines-avoid-non-const-global-variables]

variable 'npsr' is non-const and globally accessible, consider making it const

struct _NoLargeArgument {};
struct _NoSpecialCases {};
struct _NoExceptions {};
struct _LowAccuracy {};
constexpr auto kNoLargeArgument = _NoLargeArgument{};
constexpr auto kNoSpecialCases = _NoSpecialCases{};
constexpr auto kNoExceptions = _NoExceptions{};
constexpr auto kLowAccuracy = _LowAccuracy{};

struct Round {
struct _Force {};
static constexpr auto kForce = _Force{};
};

struct Subnormal {
struct _DAZ {};
struct _FTZ {};
struct _IEEE754 {};
static constexpr auto kDAZ = _DAZ{};
static constexpr auto kFTZ = _FTZ{};
static constexpr auto kIEEE754 = _IEEE754{};
};

struct FPExceptions {
static constexpr auto kNone = 0;
static constexpr auto kInvalid = FE_INVALID;
static constexpr auto kDivByZero = FE_DIVBYZERO;
static constexpr auto kOverflow = FE_OVERFLOW;
static constexpr auto kUnderflow = FE_UNDERFLOW;
};

/**
* @brief RAII floating-point precision control class
*
* The Precise class provides automatic management of floating-point
* environment settings during its lifetime. It uses RAII principles to save
* the current floating-point state on construction and restore it on
* destruction.
*
* The class is configured using variadic template arguments that specify
* the desired floating-point behavior through tag types.
*
* **IMPORTANT PERFORMANCE NOTE**: Create the Precise object BEFORE loops,
* not inside them. The constructor and destructor have overhead from saving
* and restoring floating-point state, so it should be done once per
* computational scope, not per iteration.
*
* @tparam Args Variadic template arguments for configuration flags
*
* @example
* ```cpp
* using namespace hwy::HWY_NAMESPACE;
* using namespace npsr;
* using namespace npsr::HWY_NAMESPACE;
*
* Precise precise = {kLowAccuracy, kNoSpecialCases, kNoLargeArgument};
* const ScalableTag<float> d;
* typename V = Vec<DFromV<SclableTag>>;
* for (size_t i = 0; i < n; i += Lanes(d)) {
* V input = LoadU(d, &input[i]);
* V result = Sin(precise, input);
* StoreU(result, d, &output[i]);
* }
* ```
*/
template <typename... Args>
class Precise {
public:
Precise() {
if constexpr (!kNoExceptions) {
fegetexceptflag(&_exceptions, FE_ALL_EXCEPT);
}
if constexpr (kRoundForce) {
_rounding_mode = fegetround();
int new_mode = _NewRoundingMode();
if (_rounding_mode != new_mode) {
_retrieve_rounding_mode = true;
fesetround(new_mode);
}
}
}
template <typename T1, typename... Rest>
Precise(T1&& arg1, Rest&&... rest) {}

void FlushExceptions() { fesetexceptflag(&_exceptions, FE_ALL_EXCEPT); }

void Raise(int errors) {
static_assert(!kNoExceptions,
"Cannot raise exceptions in NoExceptions mode");
_exceptions |= errors;
}
~Precise() {
FlushExceptions();
if constexpr (kRoundForce) {
if (_retrieve_rounding_mode) {
fesetround(_rounding_mode);
}
}
}
static constexpr bool kNoExceptions =
(std::is_same_v<_NoExceptions, Args> || ...);
static constexpr bool kNoLargeArgument =
(std::is_same_v<_NoLargeArgument, Args> || ...);
static constexpr bool kNoSpecialCases =
(std::is_same_v<_NoSpecialCases, Args> || ...);
static constexpr bool kLowAccuracy =
(std::is_same_v<_LowAccuracy, Args> || ...);
// defaults to high accuracy if no low accuracy flag is set
static constexpr bool kHighAccuracy = !kLowAccuracy;
// defaults to large argument support if no no large argument flag is set
static constexpr bool kLargeArgument = !kNoLargeArgument;
// defaults to special cases support if no no special cases flag is set
static constexpr bool kSpecialCases = !kNoSpecialCases;
// defaults to exception support if no no exception flag is set
static constexpr bool kExceptions = !kNoExceptions;

static constexpr bool kRoundForce =
(std::is_same_v<Round::_Force, Args> || ...);

static constexpr bool kDAZ = (std::is_same_v<Subnormal::_DAZ, Args> || ...);
static constexpr bool kFTZ = (std::is_same_v<Subnormal::_FTZ, Args> || ...);
static constexpr bool _kIEEE754 =
(std::is_same_v<Subnormal::_IEEE754, Args> || ...);
static_assert(!_kIEEE754 || !(kDAZ || kFTZ),
"IEEE754 mode cannot be used "
"with Denormals Are Zero (DAZ) or Flush To Zero (FTZ) "
"subnormal handling");
static constexpr bool kIEEE754 = _kIEEE754 || !(kDAZ || kFTZ);

private:
int _NewRoundingMode() const { return FE_TONEAREST; }
int _rounding_mode = 0;
bool _retrieve_rounding_mode = false;
fexcept_t _exceptions;
};

Precise() -> Precise<>;

// For Precise{args...} -> Precise<decltype(args)...>
template <typename T1, typename... Rest>
Precise(T1&&, Rest&&...) -> Precise<std::decay_t<T1>, std::decay_t<Rest>...>;

} // namespace npsr
#endif // NUMPY_SIMD_ROUTINES_NPSR_PRECISE_H_
52 changes: 52 additions & 0 deletions npsr/trig/data/approx.h.sol
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
suppressmessage(186, 185, 184);

procedure ApproxLut4_(pT, pFunc, pFuncDriv) {
var r, i, $;

$.num_lut = match pT.kSize
with 64: (2^9)
default: (2^8);

$.low_round = match pT.kSize
with 64: ([|24, RZ|])
default: ([|pT.kDigits, RN|]);
$.scale = 2.0 * pi / $.num_lut;

r = [||];
for i from 0 to $.num_lut - 1 do {
$.angle = i * $.scale;
$.exact = pFunc($.angle);
$.high = pT.kRound($.exact);
$.low = pT.kRound(round($.exact - $.high, $.low_round[0], $.low_round[1]));

$.deriv_exact = pFuncDriv($.angle);
$.k = ceil(log2(abs($.deriv_exact)));
if ($.deriv_exact < 0) then $.k = -$.k;

$.sigma = 2.0^$.k;
$.deriv = pT.kRound($.deriv_exact - $.sigma);
r = r @ [|$.deriv, $.sigma, $.high, $.low|];
};
return ToStringCArray(r, pT.kCSFX, 4);
};

Append(
"template <typename T> constexpr char kSinApproxTable[] = {};",
"template <> constexpr float kSinApproxTable<float>[] = ",
ApproxLut4_(Float32, sin(x), cos(x)),
"",
"template <> constexpr double kSinApproxTable<double>[] = ",
ApproxLut4_(Float64, sin(x), cos(x)),
""
);
Append(
"template <typename T> constexpr char kCosApproxTable[] = {};",
"template <> constexpr float kCosApproxTable<float>[] = ",
ApproxLut4_(Float32, cos(x), -sin(x)),
"",
"template <> constexpr double kCosApproxTable<double>[] = ",
ApproxLut4_(Float64, cos(x), -sin(x)),
""
);

WriteCPPHeader("npsr::trig::data");
Loading
Loading