diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fa47cb7..3f6c8ee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,7 +27,7 @@ jobs: - name: Architecture check run: cargo run arch-check - name: Architecture check (Optimized) - run: cargo run --features=optimize_crc32_auto arch-check + run: cargo run arch-check - if: ${{ matrix.rust-toolchain != 'nightly' }} name: Format run: cargo fmt -- --check @@ -36,8 +36,6 @@ jobs: run: cargo clippy - name: Test run: cargo test - - name: Test (Optimized) - run: cargo test --features=optimize_crc32_auto test-x86: name: Test accelerated (x86) @@ -61,8 +59,6 @@ jobs: run: cross check --target ${{ matrix.target }} - name: Test run: cross test --target ${{ matrix.target }} - - name: Test (Optimized) - run: cross test --features=optimize_crc32_auto --target ${{ matrix.target }} test-software: name: Test software fallback @@ -85,6 +81,4 @@ jobs: - name: Check run: cross check --target ${{ matrix.target }} - name: Test - run: cross test --target ${{ matrix.target }} - - name: Test (Optimized) - run: cross test --features=optimize_crc32_auto --target ${{ matrix.target }} \ No newline at end of file + run: cross test --target ${{ matrix.target }} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 4c7e1b9..ce0fa22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,7 +82,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.13.0", + "itertools", "log", "prettyplease", "proc-macro2", @@ -139,17 +139,6 @@ dependencies = [ "toml", ] -[[package]] -name = "cc" -version = "1.2.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1" -dependencies = [ - "jobserver", - "libc", - "shlex", -] - [[package]] name = "cexpr" version = "0.6.0" @@ -257,7 +246,6 @@ version = "1.2.2" dependencies = [ "bindgen", "cbindgen", - "cc", "crc", "criterion", "digest", @@ -278,7 +266,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools 0.10.5", + "itertools", "num-traits", "once_cell", "oorandom", @@ -299,7 +287,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools 0.10.5", + "itertools", ] [[package]] @@ -473,31 +461,12 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" -[[package]] -name = "jobserver" -version = "0.1.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" -dependencies = [ - "getrandom", - "libc", -] - [[package]] name = "js-sys" version = "0.3.77" @@ -516,9 +485,9 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libloading" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" +checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ 
"cfg-if", "windows-targets", @@ -618,9 +587,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.32" +version = "0.2.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" +checksum = "9dee91521343f4c5c6a63edd65e54f31f5c92fe8978c40a4282f8372194c6a7d" dependencies = [ "proc-macro2", "syn", diff --git a/Cargo.toml b/Cargo.toml index 15a261f..fba702e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,9 +30,6 @@ criterion = "0.5" cbindgen = "0.28" bindgen = "0.70" # 0.70 is the last version that supports Rust 1.81 due to 'unsafe extern' blocks -[build-dependencies] -cc = { version = "1.2", features = ["parallel"] } - # lto=true has a big improvement in performance [profile.release] lto = true @@ -47,36 +44,15 @@ harness = false [features] alloc = [] -# enable VPCLMULQDQ support in Rust for x86_64 using nightly toolchain builds +# enable experimental VPCLMULQDQ support, which landed in Rust 1.89.0-nightly, will deprecate after 1.89.0 is stable vpclmulqdq = [] -# enable AVX512 support in Rust for x86_64 using nightly toolchain builds -avx512 = [] - -# enable using fast-crc32 optimized C implementations for CRC-32/ISCSI and CRC-32/ISO-HDLC, automatically detected -optimize_crc32_auto = [] - -# the following features enable forcing custom optimized build features (rather than "auto" which attemps to pick the -# best) for CRC-32/ISCSI and CRC-32/ISO-HDLC calculations, since architecture support and performance varies - -# aarch64 NEON options -optimize_crc32_neon_eor3_v9s3x2e_s3 = [] -optimize_crc32_neon_v12e_v1 = [] -optimize_crc32_neon_v3s4x2e_v2 = [] - -# blends eor3_v9s3x2e_s3 for "large" (>1KiB) payloads, and v12e_v1 for "small" ones, which tends to yield the best -# results on modern aarch64 such as Graviton and Apple Silicon -optimize_crc32_neon_blended = [] - -# x86 SSE+ options -# this will blend automagically for CRC-32/ISO-HDLC which tends to have poor hardware support, but typically great -# support for CRC-32/ISCSI -optimize_crc32_avx512_vpclmulqdq_v3x2 = [] - -# non-blended alternatives -optimize_crc32_avx512_v4s3x3 = [] -optimize_crc32_sse_v4s3x3 = [] - -[lints.rust] -# build-time feature enablement -unexpected_cfgs = { level = "warn", check-cfg = ['cfg(optimized_crc32_iscsi)','cfg(optimized_crc32_iso_hdlc)' ] } +# the features below aren't in use, are deprecated, and will be removed in the next MAJOR version +optimize_crc32_auto = [] # deprecated +optimize_crc32_neon_eor3_v9s3x2e_s3 = [] # deprecated +optimize_crc32_neon_v12e_v1 = [] # deprecated +optimize_crc32_neon_v3s4x2e_v2 = [] # deprecated +optimize_crc32_neon_blended = [] # deprecated +optimize_crc32_avx512_vpclmulqdq_v3x2 = [] # deprecated +optimize_crc32_avx512_v4s3x3 = [] # deprecated +optimize_crc32_sse_v4s3x3 = [] # deprecated \ No newline at end of file diff --git a/README.md b/README.md index dee097f..6322a29 100644 --- a/README.md +++ b/README.md @@ -5,19 +5,19 @@ [![Latest Version](https://img.shields.io/crates/v/crc-fast.svg)](https://crates.io/crates/crc-fast) [![Documentation](https://img.shields.io/badge/api-rustdoc-blue.svg)](https://docs.rs/crc-fast) -Fast, hardware-accelerated CRC calculation for +Fast, hardware-accelerated CRC calculation for [all known CRC-32 and CRC-64 variants](https://reveng.sourceforge.io/crc-catalogue/all.htm) using SIMD intrinsics, -which can exceed _100GiB/s_ for `CRC-32`, and _50GiB/s_ for `CRC-64`, on modern systems. 
+which can exceed [100GiB/s](#performance) on modern systems. -Supports acceleration on `aarch64`, `x86_64`, and `x86` architectures, plus has a safe non-accelerated software -fallback for other architectures. +Supports acceleration on `aarch64`, `x86_64`, and `x86` architectures, plus has a safe non-accelerated table-based +software fallback for others. -The [crc crate](https://crates.io/crates/crc) is ~0.5GiB/s by default, so this is -[up to >200X faster](#tldr-just-tell-me-how-to-turn-it-up-to-11-), and even the most conservative baseline settings +The [crc crate](https://crates.io/crates/crc) is ~0.5GiB/s by default, so this is +[up to >220X faster](#tldr-just-tell-me-how-to-turn-it-up-to-11-), and even the most conservative baseline settings are >27X. -This is unique, not just because of the performance, but also because I couldn't find a single generic SIMD-accelerated -implementation (in any language) which worked for _all_ known variants, using the +This is unique, not just because of the performance, but also because I couldn't find a single generic SIMD-accelerated +implementation (in any language) which worked for _all_ known variants, using the [Rocksoft model](http://www.ross.net/crc/download/crc_v3.txt), especially the "non-reflected" variants. So I wrote one. @@ -26,6 +26,12 @@ So I wrote one. Supplies a [C/C++ compatible shared library](#cc-compatible-shared-library) for use with other non-`Rust` languages. +## Implementations + +* [AWS SDK for Rust](https://awslabs.github.io/aws-sdk-rust/) via + the [aws-smithy-checksums](https://crates.io/crates/aws-smithy-checksums) crate. +* [crc-fast-php-ext](https://github.com/awesomized/crc-fast-php-ext) `PHP` extension using this library. + ## Changes See [CHANGELOG](CHANGELOG.md). @@ -33,10 +39,11 @@ See [CHANGELOG](CHANGELOG.md). ## Build & Install `cargo build` will obviously build the library, including -the [C-compatible shared library](#c-compatible-shared-library). There are fine-tuning [feature flags](Cargo.toml) +the [C-compatible shared library](#c-compatible-shared-library). There are fine-tuning [feature flags](Cargo.toml) available, should they be necessary for your deployment and [acceleration](#acceleration-targets) targets. -A _very_ basic [Makefile](Makefile) is supplied which supports `make install` to install the shared library and header file to +A _very_ basic [Makefile](Makefile) is supplied which supports `make install` to install the shared library and header +file to the local system. Specifying the `DESTDIR` environment variable will allow you to customize the install location. ``` @@ -47,16 +54,15 @@ You'll need to adjust if you want to optimize with [feature flags](Cargo.toml). ## Usage -Add `crc-fast = { version = "1.1", features = ["optimize_crc32_auto"] }` to your `Cargo.toml` dependencies, which will -enable every available optimization for the `stable` toolchain. Adjust as necessary for your desired -[acceleration targets](#acceleration-targets). +Add `crc-fast = version = "1.3"` to your `Cargo.toml` dependencies, which will enable every available optimization for +the `stable` toolchain. Adjust as necessary for your desired [acceleration targets](#acceleration-targets). ### Digest Implements the [digest::DynDigest](https://docs.rs/digest/latest/digest/trait.DynDigest.html) trait for easier integration with existing Rust code. 
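For example, a `Digest` can be handed to generic code that only knows about the `DynDigest` trait object. This is a minimal sketch, not one of the crate's documented examples; the import paths and the `Digest::new(Crc32IsoHdlc)` constructor are assumed from the examples below, and the trait's `finalize_reset` returns the checksum as raw bytes rather than the integer returned by the crate's own `finalize`.

```rust
use crc_fast::{CrcAlgorithm::Crc32IsoHdlc, Digest};
use digest::DynDigest;

// Generic code that only knows about the trait object, not crc-fast itself.
fn checksum_parts(hasher: &mut dyn DynDigest, parts: &[&[u8]]) -> Box<[u8]> {
    for &part in parts {
        hasher.update(part);
    }
    hasher.finalize_reset()
}

let mut digest = Digest::new(Crc32IsoHdlc);

// feed the standard check input in two pieces
let bytes = checksum_parts(&mut digest, &[&b"1234"[..], &b"56789"[..]]);
```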
-Creates a `Digest` which can be updated over time, for stream processing, intermittent workloads, etc, enabling +Creates a `Digest` which can be updated over time, for stream processing, intermittent workloads, etc, enabling finalizing the checksum once processing is complete. ```rust @@ -87,7 +93,7 @@ let file_on_disk = binding.to_str().unwrap(); // actual usage let mut digest = Digest::new(Crc32IsoHdlc); let mut file = File::open(file_on_disk).unwrap(); -std::io::copy(&mut file, &mut digest).unwrap(); +std::io::copy( & mut file, & mut digest).unwrap(); let checksum = digest.finalize(); assert_eq!(checksum, 0xcbf43926); @@ -138,17 +144,17 @@ assert_eq!(checksum.unwrap(), 0xcbf43926); ## C/C++ compatible shared library -`cargo build` will produce a shared library target (`.so` on Linux, `.dll` on Windows, `.dylib` on macOS, etc) and an -auto-generated [libcrc_fast.h](libcrc_fast.h) header file for use in non-Rust projects, such as through +`cargo build` will produce a shared library target (`.so` on Linux, `.dll` on Windows, `.dylib` on macOS, etc) and an +auto-generated [libcrc_fast.h](libcrc_fast.h) header file for use in non-Rust projects, such as through [FFI](https://en.wikipedia.org/wiki/Foreign_function_interface). There is a [crc-fast PHP extension](https://github.com/awesomized/crc-fast-php-ext) using it, for example. ## Background -This implementation is based on Intel's +This implementation is based on Intel's [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) -white paper, though it folds 8-at-a-time, like other modern implementations, rather than the 4-at-a-time as in Intel's +white paper, though it folds 8-at-a-time, like other modern implementations, rather than the 4-at-a-time as in Intel's paper. This library works on `aarch64`, `x86_64`, and `x86` architectures, and is hardware-accelerated and optimized for each @@ -157,7 +163,7 @@ architecture. Inspired by [`crc32fast`](https://crates.io/crates/crc32fast), [`crc64fast`](https://crates.io/crates/crc64fast), and [`crc64fast-nvme`](https://crates.io/crates/crc64fast-nvme), each of which only accelerates a single, different CRC -variant, and all of them were "reflected" variants. +variant, and all of them were "reflected" variants. In contrast, this library accelerates _every known variant_ (and should accelerate any future variants without changes), including all the "non-reflected" variants. @@ -169,185 +175,212 @@ stand out as being the most important and widely used (all of which are "reflect ### [CRC-32/ISCSI](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-32-iscsi) -Many, but not all, implementations simply call this `crc32c` and it's probably the 2nd most popular and widely used, +Many, but not all, implementations simply call this `crc32c` and it's probably the 2nd most popular and widely used, after `CRC-32/ISO-HDLC`. It's used in `iSCSI`, `ext4`, `btrfs`, etc. +Both `x86_64` and `aarch64` have native hardware support for this CRC variant, so we can use +[fusion](https://www.corsix.org/content/fast-crc32c-4k) in many cases to accelerate it further by fusing SIMD CLMUL +instructions with the native CRC instructions. 
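As a rough illustration of the scalar half of that fusion (this is *not* how this library is implemented internally, just a sketch assuming an `x86_64` CPU with `SSE4.2`), the dedicated `crc32` instructions process the Castagnoli polynomial 8 bytes at a time; the fusion approach interleaves loops like this with `PCLMULQDQ` folding of separate blocks:

```rust
#[cfg(target_arch = "x86_64")]
fn crc32c_scalar(mut crc: u32, data: &[u8]) -> u32 {
    use core::arch::x86_64::{_mm_crc32_u64, _mm_crc32_u8};

    // runtime guard; a real implementation would fall back to a software path
    assert!(is_x86_feature_detected!("sse4.2"));

    crc = !crc; // CRC-32/ISCSI is reflected, with init and xorout of 0xFFFFFFFF
    let mut chunks = data.chunks_exact(8);
    unsafe {
        for chunk in &mut chunks {
            // one hardware CRC instruction per 8-byte quadword
            let word = u64::from_le_bytes(chunk.try_into().unwrap());
            crc = _mm_crc32_u64(crc as u64, word) as u32;
        }
        for &byte in chunks.remainder() {
            crc = _mm_crc32_u8(crc, byte);
        }
    }
    !crc
}
```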
+ ### [CRC-32/ISO-HDLC](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-32-iso-hdlc) Many, but not all, implementations simply call this `crc32` and it may be the most popular and widely used. It's used in `Ethernet`, `PKZIP`, `xz`, etc. +Only `aarch64` has native hardware support for this CRC variant, so we can use +[fusion](https://www.corsix.org/content/fast-crc32c-4k) on that platform, but not `x86_64`. + ### [CRC-64/NVME](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-nvme) -`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) +`CRC-64/NVME` comes from +the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023), is [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) -(as `CRC64-NVME`), and has also been implemented in the +(as `CRC64-NVME`), and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) -(where it's called `CRC-64/Rocksoft`). +(where it's been called `CRC-64/Rocksoft` in the past). Note that the `Check` value in the `NVMe` spec uses incorrect endianness (see `Section 5.2.1.3.4, Figure 120, page 83`) but all known public & private implementations agree on the correct value, which this library produces. # Acceleration targets -This library has baseline support for accelerating all known `CRC-32` and `CRC-64` variants on `aarch64`, `x86_64`, and -`x86` internally in pure `Rust`. It's extremely fast (up to dozens of GiB/s). This is the default if no feature flags are -specified. - -With feature flags, it can be even faster. 😎 +This library has baseline support for accelerating all known `CRC-32` and `CRC-64` variants on `aarch64`, `x86_64`, and +`x86` internally in pure `Rust`. It's extremely fast (up to dozens of GiB/s) by default if no feature flags are +used. ### tl;dr: Just tell me how to turn it up to 11! 🤘 -For modern `x86_64` systems (requires `nightly` toolchain) which further accelerates _all_ variants, especially -`CRC-32/ISCSI` and `CRC-32/ISO-HDLC`: -``` -rustup toolchain install nightly -cargo +nightly build --release --features=optimize_crc32_auto,vpclmulqdq -``` +For `aarch64` and older `x86_64` systems, the release build will use the best available acceleration: -For `aarch64`, and older `x86_64` / `x86`, systems (no `nightly` required) which further accelerates `CRC-32/ISCSI` and -`CRC-32/ISO-HDLC`: ``` -cargo build --release --features=optimize_crc32_auto +cargo build --release ``` -At [Awesome](https://awesome.co/), we use these 👆 at large scale in production at [Flickr](https://flickr.com/) and -[SmugMug](https://www.smugmug.com/). - -### CRC-32/ISO-HDLC and CRC-32/ISCSI optimization +For modern `x86_64` systems, you can enable [experimental VPCLMULQDQ support](#experimental-vpclmulqdq-support-in-rust) +for a ~2X performance boost. -By using the `optimize_crc32_auto` feature flag, the library will use -[fast-crc32](https://github.com/corsix/fast-crc32/) instead to accelerate _only_ `CRC-32/ISO-HDLC` and/or `CRC-32/ISCSI` -using a [fusion](https://www.corsix.org/content/fast-crc32c-4k) of hardware `crc32(c)` support and `PCLMULQDQ`. 
- -`fast-crc32` does not accelerate any other `CRC-32` variants, or any `CRC-64` variants, since none of the others have -native hardware-acceleration support in any CPUs which would enable `fusion`. +At [Awesome](https://awesome.co/), we use these 👆 at large scale in production at [Flickr](https://flickr.com/) and +[SmugMug](https://www.smugmug.com/). -`fast-crc32` will use `VPCLMULQDQ` if available, without requiring the need for `nightly` builds (since it's an external -C implementation). +### Checking your platform capabilities There's an [arch-check](src/bin/arch-check.rs) binary which will explain the selected target architecture. ``` // test it works on your system (patches welcome!) -cargo test --features=optimize_crc32_auto +cargo test // examine the chosen acceleration targets -cargo run --features=optimize_crc32_auto arch-check +cargo run arch-check // build for release -cargo build --features=optimize_crc32_auto --release +cargo build --release ``` -There are additional [feature flags](Cargo.toml) to force certain implementations for fine-tuning, benchmarking, etc. - ### Experimental VPCLMULQDQ support in Rust -This library also supports [VPCLMULQDQ](https://en.wikichip.org/wiki/x86/vpclmulqdq) for accelerating all -`CRC-32` and `CRC-64` variants on modern `x86_64` platforms which support it when using `nightly` builds and the -`vpclmulqdq` feature flag. +This library also supports [VPCLMULQDQ](https://en.wikichip.org/wiki/x86/vpclmulqdq) for accelerating all `CRC-32` and +`CRC-64` variants on modern `x86_64` +platforms which support it when using `nightly` builds and the `vpclmulqdq` feature flag. -Typical performance boosts are ~2X, and they apply to CPUs beginning with Intel -[Ice Lake](https://en.wikipedia.org/wiki/Ice_Lake_%28microprocessor%29) (Sep 2019) and AMD -[Zen4](https://en.wikipedia.org/wiki/Zen_4) (Sep 2022). +Typical performance boosts are ~2X, and they apply to CPUs beginning with Intel +[Ice Lake](https://en.wikipedia.org/wiki/Ice_Lake_%28microprocessor%29) (Sep 2019) and +AMD [Zen4](https://en.wikipedia.org/wiki/Zen_4) (Sep 2022). ``` rustup toolchain install nightly cargo +nightly build --release --features=vpclmulqdq ``` -There's a [tracking issue](https://github.com/rust-lang/rust/issues/111137) for when these features might land on -`stable`, which looks like [very soon](https://github.com/rust-lang/rust/issues/111137#issuecomment-2787196977), at -which point this library will adopt it as a default. +`AVX512` support with `VPCLMULQDQ` is stabilized on [1.89.0](https://releases.rs/docs/1.89.0/), so once that becomes +stable in August 2025, this library will be updated to use it by default without needing the `nightly` toolchain. ## Performance -Modern systems can exceed 100GiB/s for calculating `CRC-32/ISCSI` and `CRC-32/ISO-HDLC`, and 50GiB/s for calculating -`CRC-64/NVME`. +Modern systems can exceed 100 GiB/s for calculating `CRC-32/ISCSI`, `CRC-32/ISO-HDLC`, +`CRC-64/NVME`, and all other reflected variants. (Forward variants are slower, due to the extra shuffle-masking, but +are still extremely fast in this library). -This is a summary of the best [targets](#acceleration-targets) for the most important and popular CRC checksums. More -extensive benchmark results, with other targets and variants, can be found in the [benches](benches/README.md) folder. +This is a summary of the best [targets](#acceleration-targets) for the most important and popular CRC checksums. 
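To reproduce numbers like these on your own hardware, the [criterion](https://crates.io/crates/criterion) benchmarks in [benches](benches/benchmark.rs) can be run directly. The exact commands below are an assumption based on the standard `cargo bench` workflow rather than a documented interface:

```
// baseline acceleration (stable toolchain)
cargo bench

// with experimental VPCLMULQDQ acceleration on modern x86_64
cargo +nightly bench --features=vpclmulqdq
```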
-### CRC-32/ISCSI +### CRC-32/ISCSI (reflected) AKA `crc32c` in many, but not all, implementations. -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:-----------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-48xl | avx512_vpclmulqdq_v3x2 | ~38.0 | ~111.7 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512_vpclmulqdq_v3x2 | ~21.1 | ~54.6 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_blended | ~18.5 | ~31.6 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_v12e_v1 | ~54.8 | ~99.6 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon_blended | ~60.8 | ~96.3 | -| aarch64 | Apple | M2 Ultra | Mac Studio (24 core) | neon_blended | ~50.3 | ~87.6 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~49 | ~111 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~18 | ~52 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~23 | ~54 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~20 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~49 | ~99 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 | -### CRC-32/ISO-HDLC +### CRC-32/ISO-HDLC (reflected) AKA `crc32` in many, but not all, implementations. 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:-------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-48xl | avx2_blended | ~16.6 | ~110.4 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx2_blended | ~17.2 | ~53.8 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_blended | ~18.5 | ~31.5 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_v12e_v1 | ~56.5 | ~98.8 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon_v12e_v1 | ~59.2 | ~105.3 | -| aarch64 | Apple | M2 Ultra | Mac Studio (24 core) | neon_blended | ~50.1 | ~87.0 | - -### CRC-64/NVME - -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:---------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512_vpclmulqdq | ~24.9 | ~109.7 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512_vpclmulqdq | ~24.4 | ~54.6 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_pclmulqdq_eor3 | ~18.7 | ~36.8 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon_pclmulqdq | ~9.8 | ~15.9 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_pclmulqdq_eor3 | ~49.5 | ~71.9 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | avx512-vpclmulqdq* | ~24 | ~110 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | sse-pclmulqdq | ~21 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~24 | ~55 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~12 | ~14 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~48 | ~98 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 | + +### CRC-64/NVME (reflected) + +[AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) + +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~25 | ~110 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~21 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~25 | ~55 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~14 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~20 | ~37 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~16 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~50 | ~72 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~52 | ~72 | + +### CRC-32/BZIP2 (forward) + +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~23 
| ~56 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~21 | ~43 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~16 | ~32 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~41 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~47 | ~64 | + +### CRC-64/ECMA-182 (forward) + +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~24 | ~56 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~21 | ~43 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~18 | ~31 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~40 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~46 | ~61 | + +\* = [Experimental VPCLMULQDQ support in Rust](#experimental-vpclmulqdq-support-in-rust) is enabled. ## Other CRC widths -There are [a lot of other known CRC widths and variants](https://reveng.sourceforge.io/crc-catalogue/all.htm), ranging -from `CRC-3/GSM` to `CRC-82/DARC`, and everything in between. +There are [a lot of other known CRC widths and variants](https://reveng.sourceforge.io/crc-catalogue/all.htm), ranging +from `CRC-3/GSM` to `CRC-82/DARC`, and everything in between. -Since [Awesome](https://awesome.co) doesn't use any that aren't `CRC-32` or `CRC-64` in length, this library doesn't -currently support them, either. (It should support any newly created or discovered `CRC-32` and `CRC-64` variants, +Since [Awesome](https://awesome.co) doesn't use any that aren't `CRC-32` or `CRC-64` in length, this library doesn't +currently support them, either. (It should support any newly created or discovered `CRC-32` and `CRC-64` variants, though, with zero changes other than defining the [Rocksoft](http://www.ross.net/crc/download/crc_v3.txt) parameters). In theory, much of the "heavy lifting" has been done, so it should be possible to add other widths with minimal effort. PRs welcome! -## Implementations -* [crc-fast-php-ext](https://github.com/awesomized/crc-fast-php-ext) `PHP` extension using this library. - ## References * [crc32-fast](https://crates.io/crates/crc32fast) Original `CRC-32/ISO-HDLC` (`crc32`) implementation in `Rust`. * [crc64-fast](https://github.com/tikv/crc64fast) Original `CRC-64/XZ` implementation in `Rust`. * [crc64fast-nvme](https://github.com/awesomized/crc64fast-nvme) Original `CRC-64/NVME` implementation in `Rust`. 
-* [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) +* [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) Intel's paper. -* [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) +* [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) The NVMe spec, including `CRC-64-NVME` (with incorrect endian `Check` value in `Section 5.2.1.3.4, Figure 120, page 83`). * [CRC-64/NVME](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-nvme) The `CRC-64/NVME` quick definition. -* [A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS](http://www.ross.net/crc/download/crc_v3.txt) Best description of +* [A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS](http://www.ross.net/crc/download/crc_v3.txt) Best description of CRC I've seen to date (and the definition of the Rocksoft model). * [Linux implementation](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c) Linux implementation of `CRC-64/NVME`. -* [MASM/C++ artifacts implementation](https://github.com/jeffareid/crc/) - Reference MASM/C++ implementation for +* [MASM/C++ artifacts implementation](https://github.com/jeffareid/crc/) - Reference MASM/C++ implementation for generating artifacts. * [Intel isa-l GH issue #88](https://github.com/intel/isa-l/issues/88) - Additional insight into generating artifacts. -* [StackOverflow PCLMULQDQ CRC32 answer](https://stackoverflow.com/questions/71328336/fast-crc-with-pclmulqdq-not-reflected/71329114#71329114) +* [StackOverflow PCLMULQDQ CRC32 answer](https://stackoverflow.com/questions/71328336/fast-crc-with-pclmulqdq-not-reflected/71329114#71329114) Insightful answer to implementation details for CRC32. -* [StackOverflow PCLMULQDQ CRC32 question](https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq) +* [StackOverflow PCLMULQDQ CRC32 question](https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq) Insightful question & answer to CRC32 implementation details. 
* [AWS S3 announcement about CRC64-NVME support](https://aws.amazon.com/blogs/aws/introducing-default-data-integrity-protections-for-new-objects-in-amazon-s3/) * [AWS S3 docs on checking object integrity using CRC64-NVME](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) * [Vector Carry-Less Multiplication of Quadwords (VPCLMULQDQ) details](https://en.wikichip.org/wiki/x86/vpclmulqdq) * [Linux kernel updates by Eric Biggers to use VPCLMULQDQ, etc](https://lkml.org/lkml/2025/2/10/1367) +* [Faster CRC32-C on x86](https://www.corsix.org/content/fast-crc32c-4k) +* [Faster CRC32 on the Apple M1](https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/) +* [An alternative exposition of crc32_4k_pclmulqdq](https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq) +* [fast-crc32](https://github.com/corsix/fast-crc32) - implementations of fusion for two CRC-32 variants. ## License diff --git a/benches/benchmark.rs b/benches/benchmark.rs index aacbac8..fffb0a4 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -4,6 +4,7 @@ use crc_fast::checksum; use crc_fast::CrcAlgorithm; use criterion::*; use rand::{rng, RngCore}; +use std::time::Duration; pub const SIZES: &[(&str, i32); 2] = &[ ("1 MiB", 1024 * 1024), @@ -29,11 +30,12 @@ pub const SIZES: &[(&str, i32); 2] = &[ ]; // these are the most important algorithms in popular use, with forward/reflected coverage -pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 4] = &[ - CrcAlgorithm::Crc32Autosar, // reflected, internal - CrcAlgorithm::Crc32Iscsi, // reflected, custom - CrcAlgorithm::Crc32IsoHdlc, // reflected, custom - CrcAlgorithm::Crc32Bzip2, // forward, internal +pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 3] = &[ + // benchmark both CRC-32/ISCSI and CRC-32/ISO-HDLC since they're special flowers with lots of + // different acceleration targets. 
+ CrcAlgorithm::Crc32Iscsi, // reflected + CrcAlgorithm::Crc32IsoHdlc, // reflected + CrcAlgorithm::Crc32Bzip2, // forward ]; // these are the most important algorithms in popular use, with forward/reflected coverage @@ -78,13 +80,9 @@ fn bench_crc32(c: &mut Criterion) { let mut group = c.benchmark_group("CRC-32"); println!( - "CRC-32/ISCSI implementation {}", + "Acceleration target: {}", crc_fast::get_calculator_target(CrcAlgorithm::Crc32Iscsi) ); - println!( - "CRC-32/ISO-HDLC implementation {}", - crc_fast::get_calculator_target(CrcAlgorithm::Crc32IsoHdlc) - ); for (size_name, size) in SIZES { let buf = create_aligned_data(&*random_data(*size)); @@ -101,6 +99,7 @@ fn bench_crc32(c: &mut Criterion) { group.throughput(Throughput::Bytes(*size as u64)); group.sample_size(1000); + group.measurement_time(Duration::from_secs(30)); let bench_name = [alg_suffix.unwrap(), "(checksum)"].join(" "); @@ -128,6 +127,11 @@ fn bench_crc32(c: &mut Criterion) { #[inline(always)] fn bench_crc64(c: &mut Criterion) { + println!( + "Acceleration target: {}", + crc_fast::get_calculator_target(CrcAlgorithm::Crc64Nvme) + ); + let mut group = c.benchmark_group("CRC-64"); for (size_name, size) in SIZES { @@ -145,6 +149,7 @@ fn bench_crc64(c: &mut Criterion) { group.throughput(Throughput::Bytes(*size as u64)); group.sample_size(1000); + group.measurement_time(Duration::from_secs(30)); let bench_name = [alg_suffix.unwrap(), "(checksum)"].join(" "); @@ -170,6 +175,6 @@ fn bench_crc64(c: &mut Criterion) { } } -criterion_group!(benches, bench_crc64, bench_crc32); +criterion_group!(benches, bench_crc32, bench_crc64); criterion_main!(benches); diff --git a/build.rs b/build.rs deleted file mode 100644 index e7b3893..0000000 --- a/build.rs +++ /dev/null @@ -1,230 +0,0 @@ -#![allow(dead_code)] -#![allow(unused)] - -extern crate cc; - -use cc::Build; -use std::env; - -#[cfg(target_arch = "aarch64")] -use std::arch::is_aarch64_feature_detected; - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use std::arch::is_x86_feature_detected; - -fn main() { - // Windows doesn't build the C bindings automatically, and since they're auto-generated from - // another project, I'm not inclined to fix it. The Rust implementation is still very fast. 
- #[cfg(target_os = "windows")] - return; - - // build hardware optimized version - build_optimized(); -} - -/// Builds hardware-optimized versions of the CRC32 functions -fn build_optimized() { - // in build scripts, the target architecture is only available via an environment variable - let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); - - if "aarch64" == target_arch { - return build_optimized_aarch64(); - } - - if "x86_64" == target_arch || "x86" == target_arch { - build_optimized_x86() - } - - // fall back to Rust implementation -} - -fn build_optimized_target_crc32_iscsi(name: &str, flags: &[String]) { - build_optimized_target(name, flags); - - println!("cargo:rustc-cfg=optimized_crc32_iscsi"); -} - -fn build_optimized_target_crc32_iso_hdlc(name: &str, flags: &[String]) { - build_optimized_target(name, flags); - - println!("cargo:rustc-cfg=optimized_crc32_iso_hdlc"); -} - -fn build_optimized_target(name: &str, flags: &[String]) { - // Create a longer-lived binding as suggested by the error message - let mut binding = Build::new(); - let mut build = binding.file(format!("include/{name}.c")).include("include"); - - // Apply each flag individually - for flag in flags { - build = build.flag(flag); - } - - build.compile(name); -} - -fn build_optimized_aarch64() { - // feature flag overrides to allow forcing a specific implementation - - // NEON EOR3, which seems to be faster for larger payloads, - // but slower for smaller ones than v12e_v1 - #[cfg(feature = "optimize_crc32_neon_eor3_v9s3x2e_s3")] - return build_neon_eor3_v9s3x2e_s3(); - - // NEON w/o EOR3, tuned for Apple M1, which is MUCH faster at smaller payloads, and slightly - // slower at larger ones, on my Apple M2 Ultra - #[cfg(feature = "optimize_crc32_neon_v12e_v1")] - return build_neon_v12e_v1(); - - // NEON w/o EOR3, tuned for Ampere Altra Arm (GCP Tau T2A) - #[cfg(feature = "optimize_crc32_neon_v3s4x2e_v2")] - return build_neon_v3s4x2e_v2(); - - // NEON w/EOR3 for large payloads (>1KiB), NEON w/o EOR3 for small ones - #[cfg(feature = "optimize_crc32_neon_blended")] - return build_neon_blended(); - - // no auto-optimize enabled, return and use the internal Rust implementation - #[cfg(feature = "optimize_crc32_auto")] - { - // for auto, default to NEON blended with EOR3 for large (>1KiB) payloads, w/o EOR3 for - // small ones - #[allow(unreachable_code)] - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_aarch64_feature_detected!("crc") && is_aarch64_feature_detected!("sha3") { - return build_neon_blended(); - } - - // for auto, fallback to non-EOR3 if SHA3 is not available - #[allow(unreachable_code)] - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_aarch64_feature_detected!("crc") { - build_neon_v12e_v1() - } - } - - // fall through to internal Rust implementation -} - -fn build_neon_blended() { - println!("Building NEON blended"); - - let flags = [String::from("-march=armv8.2-a+crypto+crc+sha3")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_blended", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_blended", &flags); -} - -fn build_neon_eor3_v9s3x2e_s3() { - println!("Building NEON EOR3 v9s3x2e s3"); - - let flags = [String::from("-march=armv8.2-a+crypto+crc+sha3")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_eor3_v9s3x2e_s3", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_eor3_v9s3x2e_s3", &flags); -} - -fn build_neon_v12e_v1() { - println!("Building NEON v12e v1"); - - let flags = 
[String::from("-march=armv8-a+crypto+crc")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_v12e_v1", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_v12e_v1", &flags); -} - -fn build_neon_v3s4x2e_v2() { - println!("Building NEON v12e v1"); - - let flags = [String::from("-march=armv8-a+crypto+crc")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_v3s4x2e_v2", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_v3s4x2e_v2", &flags); -} - -fn build_optimized_x86() { - // feature flag overrides to allow forcing a specific implementation - - #[cfg(feature = "optimize_crc32_avx512_vpclmulqdq_v3x2")] - return build_avx512_vpclmulqdq_v3x2(); - - #[cfg(feature = "optimize_crc32_avx512_v4s3x3")] - return build_avx512_v4s3x3(); - - #[cfg(feature = "optimize_crc32_sse_v4s3x3")] - return build_sse_v4s3x3(); - - // no auto-optimize enabled, return and use the internal Rust implementation - #[cfg(feature = "optimize_crc32_auto")] - { - // for auto, default to the best available implementation based on CPU features - - // in build scripts, the target architecture is only available via an environment variable - let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); - if "x86" == target_arch { - // this is the only one supported on 32-bit x86 systems - crate::build_sse_v4s3x3() - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - { - return build_avx512_vpclmulqdq_v3x2(); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("pclmulqdq") - { - return crate::build_avx512_v4s3x3(); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") { - crate::build_sse_v4s3x3() - } - } - - // fall through to internal Rust implementation -} - -fn build_avx512_vpclmulqdq_v3x2() { - println!("Building AVX512 VPCLMULQDQ v3x2"); - - let flags = [ - String::from("-msse4.2"), - String::from("-mpclmul"), - String::from("-mavx512f"), - String::from("-mavx512vl"), - String::from("-mvpclmulqdq"), - ]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_avx512_vpclmulqdq_v3x2", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_avx512_vpclmulqdq_v3x2", &flags); -} - -fn build_avx512_v4s3x3() { - println!("Building AVX512 v4s3x3"); - - let flags = [ - String::from("-msse4.2"), - String::from("-mpclmul"), - String::from("-mavx512f"), - String::from("-mavx512vl"), - ]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_avx512_v4s3x3", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_avx512_v4s3x3", &flags); -} - -fn build_sse_v4s3x3() { - println!("Building SSE v4s3x3 for x86 / x86_64"); - - let flags = [String::from("-msse4.2"), String::from("-mpclmul")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_sse_v4s3x3", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_sse_v4s3x3", &flags); -} diff --git a/include/crc32_iscsi.h b/include/crc32_iscsi.h deleted file mode 100644 index 142dc22..0000000 --- a/include/crc32_iscsi.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Generated header for hardware-accelerated CRC-32/ISCSI implementation */ -/* Original implementation from https://github.com/corsix/fast-crc32/ */ -/* MIT licensed */ - -#ifndef CRC32_ISCSI_H -#define CRC32_ISCSI_H - 
-#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The target build properties (CPU architecture and fine-tuning parameters) for the compiled implementation. - */ -extern const char *const ISCSI_TARGET; - -/** - * Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation. - */ -const char *get_iscsi_target(void); - -/** - * Calculate CRC-32/ISCSI checksum using hardware acceleration - * - * @param crc0 Initial CRC value (typically 0) - * @param buf Pointer to input data buffer - * @param len Length of input data in bytes - * - * @return Calculated CRC-32/ISCSI checksum - */ -uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len); - -#ifdef __cplusplus -} -#endif - -#endif /* CRC32_ISCSI_H */ \ No newline at end of file diff --git a/include/crc32_iscsi_avx512_v4s3x3.c b/include/crc32_iscsi_avx512_v4s3x3.c deleted file mode 100644 index ce0b39f..0000000 --- a/include/crc32_iscsi_avx512_v4s3x3.c +++ /dev/null @@ -1,149 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512 -p crc32c -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -const char *const ISCSI_TARGET = "x86_64_avx512_v4s3x3"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = _mm_crc32_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = _mm_crc32_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. 
*/ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128((const __m128i*)buf2), 0x96); - x1 = _mm_ternarylogic_epi64(x1, y1, _mm_loadu_si128((const __m128i*)(buf2 + 16)), 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, _mm_loadu_si128((const __m128i*)(buf2 + 32)), 0x96); - x3 = _mm_ternarylogic_epi64(x3, y3, _mm_loadu_si128((const __m128i*)(buf2 + 48)), 0x96); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); - k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); - /* Final scalar chunk. */ - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - vc ^= _mm_extract_epi64(crc_shift(_mm_crc32_u64(_mm_crc32_u64(0, _mm_extract_epi64(x0, 0)), _mm_extract_epi64(x0, 1)), klen * 3 + 8), 0); - /* Final 8 bytes. 
*/ - buf += klen * 2; - crc0 = crc2; - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_avx512_vpclmulqdq_v3x2.c b/include/crc32_iscsi_avx512_vpclmulqdq_v3x2.c deleted file mode 100644 index 9ff1d8a..0000000 --- a/include/crc32_iscsi_avx512_vpclmulqdq_v3x2.c +++ /dev/null @@ -1,97 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) -#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) - -const char *const ISCSI_TARGET = "x86_64_avx512_vpclmulqdq_v3x2"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - while (((uintptr_t)buf & 56) && len >= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 384) { - __m128i z0; - /* First vector chunk. */ - __m512i x0 = _mm512_loadu_si512((const void*)buf), y0; - __m512i x1 = _mm512_loadu_si512((const void*)(buf + 64)), y1; - __m512i x2 = _mm512_loadu_si512((const void*)(buf + 128)), y2; - __m512i k; - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0xa87ab8a8, 0, 0xab7aff2a, 0)); - x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - /* Main loop. */ - while (len >= 384) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)buf), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 64)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 128)), 0x96); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - } - /* Reduce x0 ... x2 to just x0. 
*/ - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0)); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - x1 = x2; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - /* Reduce 512 bits to 128 bits. */ - k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb, 0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0); - y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); - y0 = _mm512_xor_si512(y0, k); - z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96); - z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0)); - crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_neon_blended.c b/include/crc32_iscsi_neon_blended.c deleted file mode 100644 index 6ae7572..0000000 --- a/include/crc32_iscsi_neon_blended.c +++ /dev/null @@ -1,319 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ */ -/* Modified post-generation to improve function names, include build targets, - and bifurcate large (>1KiB) and small payloads for optimized performance */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_blended"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32cw(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32cd(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_AINLINE uint32_t crc32_iscsi_large_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - size_t blk = (len - 0) / 192; - size_t klen = 
blk * 16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x7e908048, 0xc96cfdc0}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y1 = clmul_lo_eor3(x1, k), x1 = clmul_hi_eor3(x1, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y3 = clmul_lo_eor3(x3, k), x3 = clmul_hi_eor3(x3, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y5 = clmul_lo_eor3(x5, k), x5 = clmul_hi_eor3(x5, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - y7 = clmul_lo_eor3(x7, k), x7 = clmul_hi_eor3(x7, k); - y8 = clmul_lo_eor3(x8, k), x8 = clmul_hi_eor3(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. 
*/ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. */ - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32cd(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} - - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint32_t crc32_iscsi_small_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xa87ab8a8, 0xab7aff2a}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - // Define 1 KiB threshold (1024 bytes) - const size_t LARGE_BUFFER_THRESHOLD = 1024; - - // Select implementation based on buffer size - if (len <= LARGE_BUFFER_THRESHOLD) { - return crc32_iscsi_small_impl(crc0, buf, len); - } else { - return crc32_iscsi_large_impl(crc0, buf, len); - } -} diff --git a/include/crc32_iscsi_neon_eor3_v9s3x2e_s3.c b/include/crc32_iscsi_neon_eor3_v9s3x2e_s3.c deleted file mode 100644 index 2f672e9..0000000 --- a/include/crc32_iscsi_neon_eor3_v9s3x2e_s3.c +++ /dev/null @@ -1,200 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_eor3_v9s3x2e_s3"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32cw(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32cd(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - 
size_t blk = (len - 0) / 192; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x7e908048, 0xc96cfdc0}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k); - y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. 
*/ - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32cd(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_neon_v12e_v1.c b/include/crc32_iscsi_neon_v12e_v1.c deleted file mode 100644 index 87438b0..0000000 --- a/include/crc32_iscsi_neon_v12e_v1.c +++ /dev/null @@ -1,130 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32c -a v12e_v1 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_v12e_v1"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xa87ab8a8, 0xab7aff2a}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} \ No newline at end of file diff --git a/include/crc32_iscsi_neon_v3s4x2e_v2.c b/include/crc32_iscsi_neon_v3s4x2e_v2.c deleted file mode 100644 index a855e5f..0000000 --- a/include/crc32_iscsi_neon_v3s4x2e_v2.c +++ /dev/null @@ -1,169 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32c -a v3s4x2e_v2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_v3s4x2e_v2"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32cw(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32cd(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 112) { - const char* end = buf + len; - size_t blk = (len - 0) / 112; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 4; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint32_t crc3 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64x2_t vc3; - 
uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x1c291d04, 0xddc0152b}; k = vld1q_u64(k_); } - buf2 += 48; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf2)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf2 + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf2 + 32))), x2 = clmul_hi_e(x2, k, y2); - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - buf += 16; - buf2 += 48; - } - /* Reduce x0 ... x2 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - x1 = x2; - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Final scalar chunk. */ - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - vc0 = crc_shift(crc0, klen * 3 + blk * 48); - vc1 = crc_shift(crc1, klen * 2 + blk * 48); - vc2 = crc_shift(crc2, klen + blk * 48); - vc3 = crc_shift(crc3, 0 + blk * 48); - vc = vgetq_lane_u64(veorq_u64(veorq_u64(vc0, vc1), veorq_u64(vc2, vc3)), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 32; - len -= 32; - /* Main loop. */ - while (len >= 32) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - buf += 32; - len -= 32; - } - /* Reduce x0 ... x1 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_sse_v4s3x3.c b/include/crc32_iscsi_sse_v4s3x3.c deleted file mode 100644 index 63725ea..0000000 --- a/include/crc32_iscsi_sse_v4s3x3.c +++ /dev/null @@ -1,223 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i sse -p crc32c -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ -/* Modified for 32-bit compatibility */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -const char *const ISCSI_TARGET = "x86_sse_v4s3x3"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -/* Platform-specific 64-bit handling */ -#if defined(__x86_64__) || defined(_M_X64) -/* 64-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - return _mm_cvtsi64_si128(val); -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - return _mm_cvtsi128_si64(val); -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Even on 64-bit platforms, we need to use constant indices */ - if (idx == 0) { - return _mm_cvtsi128_si64(val); - } else { - /* For the high 64 bits */ - return _mm_cvtsi128_si64(_mm_srli_si128(val, 8)); - } -} - -CRC_AINLINE uint32_t mm_crc32_u64(uint32_t crc, uint64_t val) { - return _mm_crc32_u64(crc, val); -} -#else -/* 32-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - /* Split 64-bit value into two 32-bit parts for 32-bit platform */ - __m128i result, temp; - result = _mm_cvtsi32_si128((uint32_t)val); /* Low 32 bits */ - temp = _mm_cvtsi32_si128((uint32_t)(val >> 32)); /* High 32 bits */ - - /* Shift high 32 bits to position 1 */ - temp = _mm_slli_si128(temp, 4); - - /* Combine low and high parts */ - result = _mm_or_si128(result, temp); - return result; -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - /* Combine two 32-bit values into one 64-bit result */ - uint32_t low = _mm_cvtsi128_si32(val); - uint32_t high = _mm_extract_epi32(val, 1); - return ((uint64_t)high << 32) | low; -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Extract 64 bits (two 32-bit values) */ - uint32_t low, high; - - if (idx == 0) { - low = _mm_cvtsi128_si32(val); - high = _mm_extract_epi32(val, 1); - } else { - low = _mm_extract_epi32(val, 2); - high = _mm_extract_epi32(val, 3); - } - - return ((uint64_t)high << 32) | low; -} - -CRC_AINLINE uint32_t mm_crc32_u64(uint32_t crc, uint64_t val) { - /* Process 64-bit value in two 32-bit chunks on 32-bit platforms */ - crc = _mm_crc32_u32(crc, (uint32_t)val); - crc = _mm_crc32_u32(crc, (uint32_t)(val >> 32)); - return crc; -} -#endif - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t 
acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = _mm_crc32_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = mm_crc32_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. */ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf2)), x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf2 + 16))), x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf2 + 32))), x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf2 + 48))), x3 = _mm_xor_si128(x3, y3); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); - y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); - k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); - /* Final scalar chunk. 
*/ - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - /* Extract the two 64-bit parts of x0 and combine them */ - uint64_t x0_low = mm_extract_epi64(x0, 0); - uint64_t x0_high = mm_extract_epi64(x0, 1); - uint64_t x0_combined = mm_extract_epi64(crc_shift(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), 0); - vc ^= x0_combined; - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - return ~crc0; -} \ No newline at end of file diff --git a/include/crc32_iso_hdlc.h b/include/crc32_iso_hdlc.h deleted file mode 100644 index d5b990c..0000000 --- a/include/crc32_iso_hdlc.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Generated header for hardware-accelerated CRC-32/ISO_HDLC implementation */ -/* Original implementation from https://github.com/corsix/fast-crc32/ */ -/* MIT licensed */ - -#ifndef CRC32_ISO_HDLC_H -#define CRC32_ISO_HDLC_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The target build properties (CPU architecture and fine-tuning parameters) for the compiled implementation. - */ -extern const char *const ISO_HDLC_TARGET; - -/** - * Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation. 
- */ -const char *get_iso_hdlc_target(void); - -/** - * Calculate CRC-32/ISO_HDLC checksum using hardware acceleration - * - * @param crc0 Initial CRC value (typically 0) - * @param buf Pointer to input data buffer - * @param len Length of input data in bytes - * - * @return Calculated CRC-32/ISO_HDLC checksum - */ -uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len); - -#ifdef __cplusplus -} -#endif - -#endif /* CRC32_ISO_HDLC_H */ \ No newline at end of file diff --git a/include/crc32_iso_hdlc_avx512_v4s3x3.c b/include/crc32_iso_hdlc_avx512_v4s3x3.c deleted file mode 100644 index 7e8624e..0000000 --- a/include/crc32_iso_hdlc_avx512_v4s3x3.c +++ /dev/null @@ -1,215 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512 -p crc32 -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -static const uint32_t g_crc_table[1][256] = {{ - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 
0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}}; - -const char *const ISO_HDLC_TARGET = "x86_64_avx512_v4s3x3"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint32_t crc_u8(uint32_t crc, uint8_t val) { - return (crc >> 8) ^ g_crc_table[0][(crc & 0xFF) ^ val]; -} - -CRC_AINLINE uint32_t crc_u64(uint32_t crc, uint64_t val) { - __m128i k = _mm_setr_epi32(0xf7011641, 0xb4e5b025, 0xdb710641, 1); - __m128i a = _mm_cvtsi64_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -CRC_AINLINE uint32_t crc_u32(uint32_t crc, uint32_t val) { - __m128i k = _mm_setr_epi32(0x00000000, 0xf7011641, 0xdb710641, 1); - __m128i a = _mm_cvtsi32_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = crc_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = crc_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = crc_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. 
*/ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128((const __m128i*)buf2), 0x96); - x1 = _mm_ternarylogic_epi64(x1, y1, _mm_loadu_si128((const __m128i*)(buf2 + 16)), 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, _mm_loadu_si128((const __m128i*)(buf2 + 32)), 0x96); - x3 = _mm_ternarylogic_epi64(x3, y3, _mm_loadu_si128((const __m128i*)(buf2 + 48)), 0x96); - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); - k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); - /* Final scalar chunk. */ - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - vc ^= _mm_extract_epi64(crc_shift(crc_u64(crc_u64(0, _mm_extract_epi64(x0, 0)), _mm_extract_epi64(x0, 1)), klen * 3 + 8), 0); - /* Final 8 bytes. 
*/ - buf += klen * 2; - crc0 = crc2; - crc0 = crc_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = crc_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_avx512_vpclmulqdq_v3x2.c b/include/crc32_iso_hdlc_avx512_vpclmulqdq_v3x2.c deleted file mode 100644 index e3836bb..0000000 --- a/include/crc32_iso_hdlc_avx512_vpclmulqdq_v3x2.c +++ /dev/null @@ -1,156 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512_vpclmulqdq -p crc32 -a v3x2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -static const uint32_t g_crc_table[1][256] = {{ - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 
0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}}; - -const char *const ISO_HDLC_TARGET = "x86_64_avx512_vpclmulqdq_v3x2"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint32_t crc_u8(uint32_t crc, uint8_t val) { - return (crc >> 8) ^ g_crc_table[0][(crc & 0xFF) ^ val]; -} - -CRC_AINLINE uint32_t crc_u64(uint32_t crc, uint64_t val) { - __m128i k = _mm_setr_epi32(0xf7011641, 0xb4e5b025, 0xdb710641, 1); - __m128i a = _mm_cvtsi64_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) -#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = crc_u8(crc0, *buf++); - } - while (((uintptr_t)buf & 56) && len >= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 384) { - __m128i z0; - /* First vector chunk. */ - __m512i x0 = _mm512_loadu_si512((const void*)buf), y0; - __m512i x1 = _mm512_loadu_si512((const void*)(buf + 64)), y1; - __m512i x2 = _mm512_loadu_si512((const void*)(buf + 128)), y2; - __m512i k; - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x596c8d81, 0, 0xf5e48c85, 0)); - x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - /* Main loop. */ - while (len >= 384) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)buf), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 64)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 128)), 0x96); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - } - /* Reduce x0 ... x2 to just x0. 
*/ - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0)); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - x1 = x2; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - /* Reduce 512 bits to 128 bits. */ - k = _mm512_setr_epi32(0x3db1ecdc, 0, 0xaf449247, 0, 0xf1da05aa, 0, 0x81256527, 0, 0xae689191, 0, 0xccaa009e, 0, 0, 0, 0, 0); - y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); - y0 = _mm512_xor_si512(y0, k); - z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96); - z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = crc_u64(0, _mm_extract_epi64(z0, 0)); - crc0 = crc_u64(crc0, _mm_extract_epi64(z0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = crc_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_neon_blended.c b/include/crc32_iso_hdlc_neon_blended.c deleted file mode 100644 index 9c1ff8e..0000000 --- a/include/crc32_iso_hdlc_neon_blended.c +++ /dev/null @@ -1,319 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ */ -/* Modified post-generation to improve function names, include build targets, - and bifurcate large (>1KiB) and small payloads for optimized performance */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_blended"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32w(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32d(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_AINLINE uint32_t crc32_iso_hdlc_large_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - size_t blk = (len - 0) / 192; - size_t klen = blk * 
16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y1 = clmul_lo_eor3(x1, k), x1 = clmul_hi_eor3(x1, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y3 = clmul_lo_eor3(x3, k), x3 = clmul_hi_eor3(x3, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y5 = clmul_lo_eor3(x5, k), x5 = clmul_hi_eor3(x5, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - y7 = clmul_lo_eor3(x7, k), x7 = clmul_hi_eor3(x7, k); - y8 = clmul_lo_eor3(x8, k), x8 = clmul_hi_eor3(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. 
*/ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint32_t crc32_iso_hdlc_small_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x596c8d81, 0xf5e48c85}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. 
*/ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - // Define 1 KiB threshold (1024 bytes) - const size_t LARGE_BUFFER_THRESHOLD = 1024; - - // Select implementation based on buffer size - if (len <= LARGE_BUFFER_THRESHOLD) { - return crc32_iso_hdlc_small_impl(crc0, buf, len); - } else { - return crc32_iso_hdlc_large_impl(crc0, buf, len); - } -} - diff --git a/include/crc32_iso_hdlc_neon_eor3_v9s3x2e_s3.c b/include/crc32_iso_hdlc_neon_eor3_v9s3x2e_s3.c deleted file mode 100644 index 8197fb3..0000000 --- a/include/crc32_iso_hdlc_neon_eor3_v9s3x2e_s3.c +++ /dev/null @@ -1,200 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_eor3_v9s3x2e_s3"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32w(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32d(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - size_t blk = (len - 0) / 192; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k); - y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. 
*/ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_neon_v12e_v1.c b/include/crc32_iso_hdlc_neon_v12e_v1.c deleted file mode 100644 index 8ddafa6..0000000 --- a/include/crc32_iso_hdlc_neon_v12e_v1.c +++ /dev/null @@ -1,130 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32 -a v12e_v1 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_v12e_v1"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x596c8d81, 0xf5e48c85}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_neon_v3s4x2e_v2.c b/include/crc32_iso_hdlc_neon_v3s4x2e_v2.c deleted file mode 100644 index 5e23963..0000000 --- a/include/crc32_iso_hdlc_neon_v3s4x2e_v2.c +++ /dev/null @@ -1,169 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32 -a v3s4x2e_v2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_v3s4x2e_v2"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32w(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32d(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 112) { - const char* end = buf + len; - size_t blk = (len - 0) / 112; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 4; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint32_t crc3 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64x2_t vc3; - uint64_t vc; - /* 
First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3db1ecdc, 0xaf449247}; k = vld1q_u64(k_); } - buf2 += 48; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf2)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf2 + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf2 + 32))), x2 = clmul_hi_e(x2, k, y2); - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - buf += 16; - buf2 += 48; - } - /* Reduce x0 ... x2 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - x1 = x2; - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Final scalar chunk. */ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - vc0 = crc_shift(crc0, klen * 3 + blk * 48); - vc1 = crc_shift(crc1, klen * 2 + blk * 48); - vc2 = crc_shift(crc2, klen + blk * 48); - vc3 = crc_shift(crc3, 0 + blk * 48); - vc = vgetq_lane_u64(veorq_u64(veorq_u64(vc0, vc1), veorq_u64(vc2, vc3)), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 32; - len -= 32; - /* Main loop. */ - while (len >= 32) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - buf += 32; - len -= 32; - } - /* Reduce x0 ... x1 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_sse_v4s3x3.c b/include/crc32_iso_hdlc_sse_v4s3x3.c deleted file mode 100644 index eaa4e60..0000000 --- a/include/crc32_iso_hdlc_sse_v4s3x3.c +++ /dev/null @@ -1,278 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i sse -p crc32 -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ -/* Modified for 32-bit compatibility */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -static const uint32_t g_crc_table[1][256] = {{ - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}}; - -const char *const ISO_HDLC_TARGET = "x86_sse_v4s3x3"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint32_t crc_u8(uint32_t crc, uint8_t val) { - return (crc >> 8) ^ g_crc_table[0][(crc & 0xFF) ^ val]; -} - -/* Platform-specific 64-bit handling */ -#if defined(__x86_64__) || defined(_M_X64) -/* 64-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - return _mm_cvtsi64_si128(val); -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - return _mm_cvtsi128_si64(val); -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Even on 64-bit platforms, we need to use constant indices */ - if (idx == 0) { - return _mm_cvtsi128_si64(val); - } else { - /* For the high 64 bits */ - return _mm_cvtsi128_si64(_mm_srli_si128(val, 8)); - } -} -#else -/* 32-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - /* Split 64-bit value into two 32-bit parts for 32-bit platform */ - __m128i result, temp; - result = _mm_cvtsi32_si128((uint32_t)val); /* Low 32 bits */ - temp = _mm_cvtsi32_si128((uint32_t)(val >> 32)); /* High 32 bits */ - - /* Shift high 32 bits to position 1 */ - temp = _mm_slli_si128(temp, 4); - - /* Combine low and high parts */ - result = _mm_or_si128(result, temp); - return result; -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - /* Combine two 32-bit values into one 64-bit result */ - uint32_t low = _mm_cvtsi128_si32(val); - uint32_t high = _mm_extract_epi32(val, 1); - return ((uint64_t)high << 32) | low; -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Extract 64 bits (two 32-bit values) */ - uint32_t low, high; - - if (idx == 0) { - low = _mm_cvtsi128_si32(val); - high = _mm_extract_epi32(val, 1); - } else { - low = _mm_extract_epi32(val, 2); - high = _mm_extract_epi32(val, 3); - } - - return ((uint64_t)high << 32) | low; -} -#endif - -CRC_AINLINE uint32_t crc_u64(uint32_t crc, uint64_t val) { - __m128i k = _mm_setr_epi32(0xf7011641, 0xb4e5b025, 0xdb710641, 1); - __m128i a = mm_cvtsi64_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -CRC_AINLINE uint32_t crc_u32(uint32_t crc, uint32_t val) { - __m128i k = _mm_setr_epi32(0x00000000, 0xf7011641, 0xdb710641, 1); - __m128i a = _mm_cvtsi32_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in 
log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = crc_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = crc_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = crc_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. */ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf2)), x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf2 + 16))), x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf2 + 32))), x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf2 + 48))), x3 = _mm_xor_si128(x3, y3); - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); - y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); - k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); - /* Final scalar chunk. 
*/ - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - /* Extract the two 64-bit parts of x0 and combine them */ - uint64_t x0_low = mm_extract_epi64(x0, 0); - uint64_t x0_high = mm_extract_epi64(x0, 1); - uint64_t x0_combined = mm_extract_epi64(crc_shift(crc_u64(crc_u64(0, x0_low), x0_high), klen * 3 + 8), 0); - vc ^= x0_combined; - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = crc_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = crc_u8(crc0, *buf++); - } - return ~crc0; -} \ No newline at end of file diff --git a/src/algorithm.rs b/src/algorithm.rs index 4082377..20c394b 100644 --- a/src/algorithm.rs +++ b/src/algorithm.rs @@ -25,13 +25,9 @@ use crate::{crc32, crc64}; #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "sse3,sse4.1,pclmulqdq") )] -#[cfg_attr( - all(target_arch = "x86_64", feature = "vpclmulqdq"), - target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl") -)] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub unsafe fn update( state: W::Value, bytes: &[u8], @@ -82,9 +78,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_by_strategy( strategy: DataChunkProcessor, data: &[u8], @@ -118,13 +114,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") -)] -#[cfg_attr( - all(target_arch = "x86_64", feature = "vpclmulqdq"), - target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_large_aligned( bytes: &[u8], state: &mut CrcState, @@ -175,9 +167,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_simd_chunks( state: &mut CrcState, first: &[T::Vector; 8], @@ -255,9 +247,9 @@ unsafe fn process_simd_chunks( #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - 
target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_exactly_16( data: &[u8], state: &mut CrcState, @@ -281,9 +273,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_16_byte_block( data_ptr: *const u8, initial_crc: T::Vector, @@ -304,9 +296,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub(crate) unsafe fn reflect_bytes( reflector: &Reflector, data: T::Vector, @@ -325,9 +317,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn fold_and_xor( current: T::Vector, coefficient: T::Vector, @@ -355,9 +347,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_17_to_31( data: &[u8], state: &mut CrcState, @@ -394,9 +386,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_32_to_255( data: &[u8], state: &mut CrcState, @@ -456,9 +448,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn get_last_two_xmms( data: &[u8], remaining_len: usize, diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs index fc653d4..884ed82 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64.rs @@ -221,7 +221,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { vreinterpretq_u8_p128(vmull_p64( vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), @@ -230,7 +230,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { // Low 64 bits of a, high 64 bits of b let a_low = 
vgetq_lane_p64(vreinterpretq_p64_u8(a), 1); @@ -239,7 +239,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { vreinterpretq_u8_p128(vmull_p64( vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), @@ -248,7 +248,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { vreinterpretq_u8_p128(vmull_p64( vgetq_lane_p64(vreinterpretq_p64_u8(a), 1), @@ -258,7 +258,7 @@ impl ArchOps for AArch64Ops { #[inline] #[cfg(target_feature = "sha3")] - #[target_feature(enable = "neon,sha3")] + #[target_feature(enable = "sha3")] unsafe fn xor3_vectors( &self, a: Self::Vector, diff --git a/src/arch/mod.rs b/src/arch/mod.rs index c2eae65..22849e0 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -5,6 +5,9 @@ //! It dispatches to the appropriate architecture-specific implementation //! based on the target architecture. +#[cfg(target_arch = "aarch64")] +use std::arch::is_aarch64_feature_detected; + #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] use crate::algorithm; @@ -14,13 +17,14 @@ use crate::structs::CrcParams; use crate::structs::{Width32, Width64}; #[cfg(target_arch = "aarch64")] -use crate::arch::aarch64::AArch64Ops; +use aarch64::AArch64Ops; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::arch::x86::X86Ops; +use x86::X86Ops; +//#[rustversion::since(1.89)] #[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] -use crate::arch::vpclmulqdq::Vpclmulqdq512Ops; +use vpclmulqdq::Vpclmulqdq512Ops; mod aarch64; mod software; @@ -33,84 +37,138 @@ mod x86; /// # Safety /// May use native CPU features #[inline] -#[cfg_attr( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") -)] -#[cfg_attr( - all(target_arch = "x86_64", feature = "vpclmulqdq"), - target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl") -)] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes")] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - #[cfg(target_arch = "aarch64")] + let ops = AArch64Ops; + + match params.width { + 64 => algorithm::update::(state, bytes, params, &ops), + 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + } +} + +//#[rustversion::before(1.89)] +#[inline] +#[cfg(all( + not(feature = "vpclmulqdq"), + any(target_arch = "x86", target_arch = "x86_64") +))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + update_x86_sse(state, bytes, params) +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(all(feature = "vpclmulqdq", target_arch = "x86"))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + update_x86_sse(state, bytes, params) +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(all(feature = "vpclmulqdq", target_arch = "x86_64"))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + use std::arch::is_x86_feature_detected; + + if bytes.len() >= 256 + && 
is_x86_feature_detected!("vpclmulqdq") + && is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512vl") { - let ops = AArch64Ops; + let ops = Vpclmulqdq512Ops::new(); - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => { - algorithm::update::(state as u32, bytes, params, &ops) as u64 - } + return match params.width { + 64 => algorithm::update::(state, bytes, params, &ops), + 32 => algorithm::update::(state as u32, bytes, params, &ops) + as u64, _ => panic!("Unsupported CRC width: {}", params.width), - } + }; } - #[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] - { - use std::arch::is_x86_feature_detected; - - if bytes.len() >= 256 && is_x86_feature_detected!("vpclmulqdq") { - let ops = Vpclmulqdq512Ops::new(); - - return match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::( - state as u32, - bytes, - params, - &ops, - ) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), - }; - } + // fallback to the standard x86 SSE implementation + update_x86_sse(state, bytes, params) +} + +#[inline] +#[cfg(all( + not(target_arch = "x86"), + not(target_arch = "x86_64"), + not(target_arch = "aarch64") +))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + software::update(state, bytes, params) +} + +#[inline] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +unsafe fn update_x86_sse(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + let ops = X86Ops; + + match params.width { + 64 => algorithm::update::(state, bytes, params, &ops), + 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), } +} - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +//#[rustversion::before(1.89)] +#[cfg(not(feature = "vpclmulqdq"))] +pub fn get_target() -> String { + #[cfg(target_arch = "aarch64")] { - let ops = X86Ops; - - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + if is_aarch64_feature_detected!("sha3") { + return "aarch64-neon-eor3-pclmulqdq".to_string(); } + + "aarch64-neon-pclmulqdq".to_string() } - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] - return software::update(state, bytes, params); + #[allow(unreachable_code)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + return "x86-sse-pclmulqdq".to_string(); + + #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] + return "software-fallback-tables".to_string(); } +//#[rustversion::since(1.89)] +#[cfg(feature = "vpclmulqdq")] pub fn get_target() -> String { - #[cfg(all(target_arch = "aarch64", target_feature = "sha3"))] - return "internal-aarch64-neon-eor3".to_string(); + #[cfg(target_arch = "aarch64")] + { + if is_aarch64_feature_detected!("sha3") { + return "aarch64-neon-eor3-pclmulqdq".to_string(); + } - #[cfg(all(target_arch = "aarch64", not(target_feature = "sha3")))] - return "internal-aarch64-neon".to_string(); + "aarch64-neon-pclmulqdq".to_string() + } - #[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] + #[cfg(target_arch = "x86_64")] { - if is_x86_feature_detected!("vpclmulqdq") { - return "internal-x86_64-avx512-vpclmulqdq".to_string(); + if is_x86_feature_detected!("vpclmulqdq") + && 
is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512vl") + { + return "x86_64-avx512-vpclmulqdq".to_string(); + } + + if is_x86_feature_detected!("avx2") { + return "x86_64-avx2-pclmulqdq".to_string(); } } #[allow(unreachable_code)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "internal-x86-sse-pclmulqdq".to_string(); + return "x86-sse-pclmulqdq".to_string(); #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback".to_string(); + return "software-fallback-tables".to_string(); } #[cfg(test)] @@ -120,6 +178,7 @@ mod tests { use crate::crc64::consts::CRC64_NVME; use crate::test::consts::{TEST_256_BYTES_STRING, TEST_ALL_CONFIGS, TEST_CHECK_STRING}; use crate::test::create_aligned_data; + use crate::test::enums::AnyCrcTestConfig; use rand::{rng, Rng}; #[test] @@ -289,100 +348,56 @@ mod tests { #[test] fn test_small_lengths_all() { - let mut rng = rng(); - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 0 to 255 for len in 0..=255 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - // direct update() call, which needs XOROUT applied - let actual = unsafe { - update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() - }; - - assert_eq!( - actual, - expected, - "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", - config.get_name(), - len, - actual, - expected - ); + test_length(len, config); } } } #[test] fn test_medium_lengths() { - let mut rng = rng(); - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 256 to 1024, which should fold and include handling remainders for len in 256..=1024 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - // direct update() call, which needs XOROUT applied - let actual = unsafe { - update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() - }; - - assert_eq!( - actual, - expected, - "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", - config.get_name(), - len, - actual, - expected - ); + test_length(len, config); } } } #[test] fn test_large_lengths() { - let mut rng = rng(); - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test ~1 MiB just before, at, and just after the folding boundaries for len in 1048575..=1048577 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - // direct update() call, which needs XOROUT applied - let actual = unsafe { - update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() - }; - - assert_eq!( - actual, - expected, - "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", - config.get_name(), - len, - actual, - expected - ); + test_length(len, config); } } } + + fn test_length(length: usize, config: &AnyCrcTestConfig) { + let mut data = vec![0u8; length]; + rng().fill(&mut data[..]); + + // Calculate expected CRC using the reference implementation + let expected = config.checksum_with_reference(&data); + + // direct update() 
call, which needs XOROUT applied + let actual = + unsafe { update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() }; + + assert_eq!( + actual, + expected, + "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", + config.get_name(), + length, + actual, + expected + ); + } } diff --git a/src/arch/vpclmulqdq.rs b/src/arch/vpclmulqdq.rs index 9fbc97c..515f802 100644 --- a/src/arch/vpclmulqdq.rs +++ b/src/arch/vpclmulqdq.rs @@ -6,18 +6,31 @@ #![cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] +//#[rustversion::since(1.89)] use crate::arch::x86::X86Ops; + +//#[rustversion::since(1.89)] use crate::enums::Reflector; + +//#[rustversion::since(1.89)] use crate::structs::CrcState; + +//#[rustversion::since(1.89)] use crate::traits::{ArchOps, EnhancedCrcWidth}; + +//#[rustversion::since(1.89)] use std::arch::x86_64::*; + +//#[rustversion::since(1.89)] use std::ops::BitXor; /// Implements the ArchOps trait using 512-bit AVX-512 and VPCLMULQDQ instructions at 512 bits. /// Delegates to X86Ops for standard 128-bit operations +//#[rustversion::since(1.89)] #[derive(Debug, Copy, Clone)] pub struct Vpclmulqdq512Ops(X86Ops); +//#[rustversion::since(1.89)] impl Vpclmulqdq512Ops { #[inline(always)] pub fn new() -> Self { @@ -26,9 +39,11 @@ impl Vpclmulqdq512Ops { } // Wrapper for __m512i to make it easier to work with +//#[rustversion::since(1.89)] #[derive(Debug, Copy, Clone)] struct Simd512(__m512i); +//#[rustversion::since(1.89)] impl Simd512 { #[inline] #[target_feature(enable = "avx512f")] @@ -97,15 +112,14 @@ impl Simd512 { } } +//#[rustversion::since(1.89)] impl Vpclmulqdq512Ops { /// Process aligned blocks using VPCLMULQDQ with 4 x 512-bit registers /// /// Note that #[inline(always)] loses the inlining performance boost, despite no native /// target_features being used directly. Odd since that's not how Rust's docs make it sound... 
#[inline] - #[target_feature( - enable = "avx,avx2,avx512f,avx512vl,avx512bw,vpclmulqdq,sse,sse2,sse4.1,pclmulqdq" - )] + #[target_feature(enable = "ssse3,avx2,avx512f,avx512vl,avx512bw,vpclmulqdq,pclmulqdq")] unsafe fn process_blocks( &self, state: &mut CrcState<::Vector>, @@ -325,6 +339,7 @@ impl Vpclmulqdq512Ops { } // 512-bit version of the Reflector +//#[rustversion::since(1.89)] #[derive(Clone, Copy)] enum Reflector512 { NoReflector, @@ -332,6 +347,7 @@ enum Reflector512 { } // Function to create the appropriate reflector based on CRC parameters +//#[rustversion::since(1.89)] #[inline(always)] unsafe fn create_reflector512(reflected: bool) -> Reflector512 { if reflected { @@ -353,6 +369,7 @@ unsafe fn create_reflector512(reflected: bool) -> Reflector512 { } // Function to apply reflection to a 512-bit vector +//#[rustversion::since(1.89)] #[inline(always)] unsafe fn reflect_bytes512(reflector: &Reflector512, data: Simd512) -> Simd512 { match reflector { @@ -362,10 +379,12 @@ unsafe fn reflect_bytes512(reflector: &Reflector512, data: Simd512) -> Simd512 { } // pre-compute the reverse indices for 512-bit shuffling +//#[rustversion::since(1.89)] static REVERSE_INDICES_512: __m512i = unsafe { std::mem::transmute([7u64, 6u64, 5u64, 4u64, 3u64, 2u64, 1u64, 0u64]) }; // Implement a 512-bit byte shuffle function +//#[rustversion::since(1.89)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] unsafe fn shuffle_bytes512(data: Simd512, mask: Simd512) -> Simd512 { @@ -377,6 +396,7 @@ unsafe fn shuffle_bytes512(data: Simd512, mask: Simd512) -> Simd512 { } // Delegate all ArchOps methods to the inner X86Ops instance +//#[rustversion::since(1.89)] impl ArchOps for Vpclmulqdq512Ops { type Vector = __m128i; @@ -405,7 +425,7 @@ impl ArchOps for Vpclmulqdq512Ops { // Delegate all other methods to X86Ops #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair( &self, high: u64, @@ -416,7 +436,7 @@ impl ArchOps for Vpclmulqdq512Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair_non_reflected( &self, high: u64, @@ -426,49 +446,49 @@ impl ArchOps for Vpclmulqdq512Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { self.0.create_vector_from_u64(value, high) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { self.0.extract_u64s(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { self.0.extract_poly64s(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.xor_vectors(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { self.0.load_bytes(ptr) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { self.0.load_aligned(ptr) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "ssse3")] unsafe fn shuffle_bytes(&self, data: 
Self::Vector, mask: Self::Vector) -> Self::Vector { self.0.shuffle_bytes(data, mask) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn blend_vectors( &self, a: Self::Vector, @@ -479,115 +499,115 @@ impl ArchOps for Vpclmulqdq512Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_8(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { self.0.set_all_bytes(value) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { self.0.create_compare_mask(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.and_vectors(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_32(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_32(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { self.0.create_vector_from_u32(value, high) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_4(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_4(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_8(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_5(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_6(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_7(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_12(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_12(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_00(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] 
unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_01(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_10(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_11(a, b) } diff --git a/src/arch/x86.rs b/src/arch/x86.rs index acd3db5..3bf635f 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -23,7 +23,7 @@ impl ArchOps for X86Ops { type Vector = __m128i; #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair( &self, high: u64, @@ -39,7 +39,7 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair_non_reflected( &self, high: u64, @@ -50,54 +50,54 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { // x86 uses custom helper self.create_u64_vector(value, high) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { [self.extract_u64_low(vector), self.extract_u64_high(vector)] } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { // On x86, poly64s and u64s extraction is the same self.extract_u64s(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_xor_si128(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] - #[target_feature(enable = "sse2,sse4.1,ssse3")] + #[target_feature(enable = "ssse3")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { // x86 uses specific SSSE3 instruction _mm_shuffle_epi8(data, mask) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn blend_vectors( &self, a: Self::Vector, @@ -109,14 +109,14 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { // x86 has a dedicated shift instruction _mm_slli_si128(vector, 8) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { _mm_set1_epi8(value as i8) } @@ -128,25 +128,25 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn and_vectors(&self, a: 
Self::Vector, b: Self::Vector) -> Self::Vector { _mm_and_si128(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { _mm_slli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { if high { _mm_insert_epi32(_mm_set1_epi32(0), value as i32, 3) @@ -156,79 +156,80 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { _mm_slli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 8) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 5) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 6) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 7) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 12) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { _mm_slli_si128(vector, 12) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x00) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x01) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x10) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x11) } + //#[rustversion::since(1.89)] #[inline] - #[cfg(any(feature = "vpclmulqdq", feature = "avx512"))] + #[cfg(feature = "vpclmulqdq")] #[target_feature(enable = "avx512f,avx512vl")] unsafe fn xor3_vectors( &self, @@ -236,29 +237,31 @@ impl ArchOps for X86Ops { b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - _mm_ternarylogic_epi64( - a, b, c, 0x96, // XOR3 - ) + if 
is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + return self.xor3_vectors_avx512(a, b, c); + } + + self.xor3_vectors_sse(a, b, c) } + //#[rustversion::before(1.89)] #[inline] - #[cfg(not(any(feature = "vpclmulqdq", feature = "avx512")))] - #[target_feature(enable = "sse2,sse4.1")] + #[cfg(not(feature = "vpclmulqdq"))] + #[target_feature(enable = "sse4.1")] unsafe fn xor3_vectors( &self, a: Self::Vector, b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - // x86 doesn't have native XOR3 in SSE, use two XORs - _mm_xor_si128(_mm_xor_si128(a, b), c) + self.xor3_vectors_sse(a, b, c) } } impl X86Ops { // Helper methods specific to x86/x86_64 #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn set_epi64x(&self, e1: u64, e0: u64) -> __m128i { #[cfg(target_arch = "x86_64")] { @@ -277,7 +280,7 @@ impl X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_u64_vector(&self, value: u64, high: bool) -> __m128i { if high { self.set_epi64x(value, 0) @@ -287,7 +290,7 @@ impl X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64_low(&self, v: __m128i) -> u64 { #[cfg(target_arch = "x86_64")] { @@ -303,7 +306,7 @@ impl X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64_high(&self, v: __m128i) -> u64 { #[cfg(target_arch = "x86_64")] { @@ -317,4 +320,21 @@ impl X86Ops { lo | (hi << 32) } } + + //#[rustversion::since(1.89)] + #[inline] + #[cfg(feature = "vpclmulqdq")] + #[target_feature(enable = "avx512f,avx512vl")] + unsafe fn xor3_vectors_avx512(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + _mm_ternarylogic_epi64( + a, b, c, 0x96, // XOR3 + ) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn xor3_vectors_sse(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + // x86 doesn't have native XOR3 in SSE, use two XORs + _mm_xor_si128(_mm_xor_si128(a, b), c) + } } diff --git a/src/bin/arch-check.rs b/src/bin/arch-check.rs index da55e65..0e7cb66 100644 --- a/src/bin/arch-check.rs +++ b/src/bin/arch-check.rs @@ -3,11 +3,12 @@ #[cfg(target_arch = "aarch64")] use std::arch::is_aarch64_feature_detected; -use crc_fast::get_calculator_target; -use crc_fast::CrcAlgorithm::{Crc32Iscsi, Crc32IsoHdlc, Crc64Nvme}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use std::arch::is_x86_feature_detected; +use crc_fast::get_calculator_target; +use crc_fast::CrcAlgorithm::{Crc32Iscsi, Crc32IsoHdlc, Crc64Nvme}; + fn main() { // Check the target architecture and call the appropriate function #[cfg(target_arch = "aarch64")] diff --git a/src/bindings/crc32_iscsi.rs b/src/bindings/crc32_iscsi.rs deleted file mode 100644 index 848d748..0000000 --- a/src/bindings/crc32_iscsi.rs +++ /dev/null @@ -1,13 +0,0 @@ -/* automatically generated by rust-bindgen 0.70.1 */ - -extern "C" { - pub static ISCSI_TARGET: *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation."] - pub fn get_iscsi_target() -> *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Calculate CRC-32/ISCSI checksum using hardware acceleration\n\n @param crc0 Initial CRC value (typically 0)\n @param buf Pointer to input data buffer\n @param len Length of input data in bytes\n\n @return Calculated CRC-32/ISCSI checksum"] - pub fn 
crc32_iscsi_impl(crc0: u32, buf: *const ::std::os::raw::c_char, len: usize) -> u32; -} diff --git a/src/bindings/crc32_iso_hdlc.rs b/src/bindings/crc32_iso_hdlc.rs deleted file mode 100644 index 25de98d..0000000 --- a/src/bindings/crc32_iso_hdlc.rs +++ /dev/null @@ -1,13 +0,0 @@ -/* automatically generated by rust-bindgen 0.70.1 */ - -extern "C" { - pub static ISO_HDLC_TARGET: *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation."] - pub fn get_iso_hdlc_target() -> *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Calculate CRC-32/ISO_HDLC checksum using hardware acceleration\n\n @param crc0 Initial CRC value (typically 0)\n @param buf Pointer to input data buffer\n @param len Length of input data in bytes\n\n @return Calculated CRC-32/ISO_HDLC checksum"] - pub fn crc32_iso_hdlc_impl(crc0: u32, buf: *const ::std::os::raw::c_char, len: usize) -> u32; -} diff --git a/src/bindings/mod.rs b/src/bindings/mod.rs deleted file mode 100644 index 50df665..0000000 --- a/src/bindings/mod.rs +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. - -//! This module provides bindings to the C implementations of CRC32 algorithms. - -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] -#![allow(unused)] - -use crate::structs::CrcParams; -use std::ffi::CStr; -use std::os::raw::c_char; - -mod crc32_iscsi; -mod crc32_iso_hdlc; - -// note that the initial state needs to be reversed -#[inline(always)] -pub(crate) fn crc32_iso_hdlc(state: u64, data: &[u8], params: CrcParams) -> u64 { - unsafe { - // TODO: Examine the C implementation and see why we have to invert the state... - crc32_iso_hdlc::crc32_iso_hdlc_impl( - !state as u32, - data.as_ptr() as *const c_char, - data.len(), - ) as u64 - ^ params.xorout - } -} - -// note that the initial state needs to be reversed -#[inline(always)] -pub(crate) fn crc32_iscsi(state: u64, data: &[u8], params: CrcParams) -> u64 { - unsafe { - // TODO: Examine the C implementation and see why we have to invert the state... 
- crc32_iscsi::crc32_iscsi_impl(!state as u32, data.as_ptr() as *const c_char, data.len()) - as u64 - ^ params.xorout - } -} - -#[allow(unused)] -pub unsafe fn get_iso_hdlc_target() -> String { - convert_to_string(crc32_iso_hdlc::get_iso_hdlc_target()) -} - -#[allow(unused)] -pub unsafe fn get_iscsi_target() -> String { - convert_to_string(crc32_iscsi::get_iscsi_target()) -} - -fn convert_to_string(ptr: *const c_char) -> String { - unsafe { - // First ensure the pointer isn't null - assert!(!ptr.is_null()); - - // Convert to CStr - this handles finding the null terminator - let c_str = CStr::from_ptr(ptr); - - // Convert to a regular string, handling any invalid UTF-8 - c_str.to_string_lossy().into_owned() - } -} diff --git a/src/crc32/algorithm.rs b/src/crc32/algorithm.rs index 28e0918..6074cd8 100644 --- a/src/crc32/algorithm.rs +++ b/src/crc32/algorithm.rs @@ -230,9 +230,9 @@ impl EnhancedCrcWidth for crate::structs::Width32 { #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub(crate) unsafe fn process_0_to_15( data: &[u8], state: &mut CrcState, diff --git a/src/crc32/fusion/aarch64.rs b/src/crc32/fusion/aarch64.rs new file mode 100644 index 0000000..c9f0207 --- /dev/null +++ b/src/crc32/fusion/aarch64.rs @@ -0,0 +1,1073 @@ +//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai using: +//! +//! ./generate -i neon -p crc32c -a v12e_v1 +//! ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 +//! ./generate -i neon -p crc32 -a v12e_v1 +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. 
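+
+// The safe wrappers below dispatch on buffer size: payloads of at most 1 KiB go through the
+// plain NEON `v12e_v1` kernels, while larger payloads use the `eor3_v9s3x2e_s3` kernels when
+// the `sha3` target feature (which provides EOR3) is enabled at compile time; without `sha3`,
+// `v12e_v1` handles every length.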
+ +#![cfg(target_arch = "aarch64")] + +use std::arch::aarch64::*; + +/// Safe wrapper for CRC32 iSCSI calculation +#[inline] +#[cfg(target_feature = "sha3")] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) + } + } +} + +#[inline] +#[cfg(not(target_feature = "sha3"))] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) } +} + +/// Safe wrapper for CRC32 ISO-HDLC calculation +#[inline] +#[cfg(target_feature = "sha3")] +pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iso_hdlc_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) + } + } +} + +#[inline] +#[cfg(not(target_feature = "sha3"))] +pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) } +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn clmul_lo_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + // Polynomial multiply low parts - convert u128 result to uint64x2_t + let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); + vreinterpretq_u64_p128(result) +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn clmul_hi_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + // Polynomial multiply high parts - convert u128 result to uint64x2_t + let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); + vreinterpretq_u64_p128(result) +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t { + // Polynomial multiply scalars - convert u128 result to uint64x2_t + let result = vmull_p64(a as u64, b as u64); + vreinterpretq_u64_p128(result) +} + +// x^n mod P, in log(n) time +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction + acc = unsafe { __crc32cw(acc, 0) }; + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + unsafe { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32cd(0, y << low); + } + } + acc +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } 
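+    // Identical square-and-reduce structure to xnmodp_crc32_iscsi above; the only difference
+    // is that this ISO-HDLC variant reduces with the plain CRC-32 instructions (__crc32w /
+    // __crc32d) rather than the CRC-32C ones, so x^n is taken modulo the ISO-HDLC polynomial.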
+ stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C) + acc = unsafe { __crc32w(acc, 0) }; + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + unsafe { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32d(0, y << low); + } + } + acc +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes,sha3")] +unsafe fn crc32_iscsi_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y1 = clmul_lo_eor3(x1, k); + x1 = clmul_hi_eor3(x1, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y3 = clmul_lo_eor3(x3, k); + x3 = clmul_hi_eor3(x3, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y5 = clmul_lo_eor3(x5, k); + x5 = clmul_hi_eor3(x5, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + let y7 = clmul_lo_eor3(x7, k); + x7 = clmul_hi_eor3(x7, k); + let y8 = clmul_lo_eor3(x8, k); + x8 = clmul_hi_eor3(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = 
__crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... x8 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); + let vc2 = crc_shift_iscsi(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_lo_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + // Polynomial multiply low parts and XOR with c + let mul_result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); + let mul_vec = vreinterpretq_u64_p128(mul_result); + veorq_u64(mul_vec, c) +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_hi_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + // 
Polynomial multiply high parts and XOR with c + let mul_result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); + let mul_vec = vreinterpretq_u64_p128(mul_result); + veorq_u64(mul_vec, c) +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon -p crc32c -a v12e_v1 +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_e(x1, k, y1); + let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_e(x2, k, y2); + let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_e(x3, k, y3); + let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_e(x4, k, y4); + let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_e(x5, k, y5); + let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_e(x6, k, y6); + let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_e(x7, k, y7); + let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_e(x8, k, y8); + let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_e(x9, k, y9); + let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_e(x10, k, y10); + let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_e(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... 
x11 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x1); + x0 = clmul_hi_e(x0, k, y0); + let y2 = clmul_lo_e(x2, k, x3); + x2 = clmul_hi_e(x2, k, y2); + let y4 = clmul_lo_e(x4, k, x5); + x4 = clmul_hi_e(x4, k, y4); + let y6 = clmul_lo_e(x6, k, x7); + x6 = clmul_hi_e(x6, k, y6); + let y8 = clmul_lo_e(x8, k, x9); + x8 = clmul_hi_e(x8, k, y8); + let y10 = clmul_lo_e(x10, k, x11); + x10 = clmul_hi_e(x10, k, y10); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x2); + x0 = clmul_hi_e(x0, k, y0); + let y4 = clmul_lo_e(x4, k, x6); + x4 = clmul_hi_e(x4, k, y4); + let y8 = clmul_lo_e(x8, k, x10); + x8 = clmul_hi_e(x8, k, y8); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes,sha3")] +unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + // ISO-HDLC specific constants + let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; + let mut k = 
vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y1 = clmul_lo_eor3(x1, k); + x1 = clmul_hi_eor3(x1, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y3 = clmul_lo_eor3(x3, k); + x3 = clmul_hi_eor3(x3, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y5 = clmul_lo_eor3(x5, k); + x5 = clmul_hi_eor3(x5, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + let y7 = clmul_lo_eor3(x7, k); + x7 = clmul_hi_eor3(x7, k); + let y8 = clmul_lo_eor3(x8, k); + x8 = clmul_hi_eor3(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... x8 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); + let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = 
end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); + let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon -p crc32 -a v12e_v1 +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc32_iso_hdlc_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + // ISO-HDLC specific constants for small implementation + let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_e(x1, k, y1); + let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_e(x2, k, y2); + let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_e(x3, k, y3); + let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_e(x4, k, y4); + let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_e(x5, k, y5); + let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_e(x6, k, y6); + let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_e(x7, k, y7); + let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_e(x8, k, y8); + let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = 
clmul_hi_e(x9, k, y9); + let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_e(x10, k, y10); + let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_e(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... x11 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x1); + x0 = clmul_hi_e(x0, k, y0); + let y2 = clmul_lo_e(x2, k, x3); + x2 = clmul_hi_e(x2, k, y2); + let y4 = clmul_lo_e(x4, k, x5); + x4 = clmul_hi_e(x4, k, y4); + let y6 = clmul_lo_e(x6, k, x7); + x6 = clmul_hi_e(x6, k, y6); + let y8 = clmul_lo_e(x8, k, x9); + x8 = clmul_hi_e(x8, k, y8); + let y10 = clmul_lo_e(x10, k, x11); + x10 = clmul_hi_e(x10, k, y10); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x2); + x0 = clmul_hi_e(x0, k, y0); + let y4 = clmul_lo_e(x4, k, x6); + x4 = clmul_hi_e(x4, k, y4); + let y8 = clmul_lo_e(x8, k, x10); + x8 = clmul_hi_e(x8, k, y8); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::consts::TEST_CHECK_STRING; + use crc::{Crc, Table}; + use rand::{rng, Rng}; + + const RUST_CRC32_ISO_HDLC: Crc> = + Crc::>::new(&crc::CRC_32_ISO_HDLC); + + const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); + + #[test] + fn test_crc32_iso_hdlc_check() { + assert_eq!( + crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xcbf43926 + ); + } + + #[test] + fn test_crc32_iso_hdlc_small_all_lengths() { + for len in 1..=255 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iso_hdlc_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iso_hdlc_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iscsi_check() { + assert_eq!( + crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xe3069283 + ); + } + + #[test] + fn test_crc32_iscsi_small_all_lengths() { + for len in 1..=255 { + crc32_iscsi_random(len); + } + 
} + + #[test] + fn test_crc32_iscsi_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + crc32_iscsi_random(len); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = "sha3"))] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = "sha3"))] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } +} diff --git a/src/crc32/fusion/mod.rs b/src/crc32/fusion/mod.rs new file mode 100644 index 0000000..d75a64b --- /dev/null +++ b/src/crc32/fusion/mod.rs @@ -0,0 +1,34 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides support for calculating CRC-32/ISO-HDLC and CRC-32/ISCSI using +//! fusion techniques. +//! +//! https://www.corsix.org/content/fast-crc32c-4k +//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! 
https://github.com/corsix/fast-crc32/ + +mod aarch64; +mod x86; + +#[inline(always)] +#[allow(unused)] +pub(crate) fn crc32_iso_hdlc(state: u32, data: &[u8]) -> u32 { + #[cfg(target_arch = "aarch64")] + return aarch64::crc32_iso_hdlc(state, data); + + #[cfg(not(target_arch = "aarch64"))] + panic!("CRC-32/ISO-HDLC with fusion is only supported on AArch64 architecture"); +} + +#[inline(always)] +pub(crate) fn crc32_iscsi(state: u32, data: &[u8]) -> u32 { + #[cfg(target_arch = "aarch64")] + return aarch64::crc32_iscsi(state, data); + + #[cfg(target_arch = "x86_64")] + return x86::crc32_iscsi(state, data); + + #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))] + panic!("CRC-32/ISCSI with fusion is only supported on AArch64 and X86_64 architectures"); +} diff --git a/src/crc32/fusion/x86.rs b/src/crc32/fusion/x86.rs new file mode 100644 index 0000000..8a68dbc --- /dev/null +++ b/src/crc32/fusion/x86.rs @@ -0,0 +1,748 @@ +//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on x86_64. +//! +//! https://www.corsix.org/content/fast-crc32c-4k +//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai using: +//! +//! ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 +//! ./generate -i avx512 -p crc32c -a v4s3x3 +//! ./generate -i sse -p crc32c -a v4s3x3 +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "x86_64")] + +use std::arch::x86_64::*; + +/// Safe wrapper for CRC32 iSCSI calculation using AVX-512 +//#[rustversion::before(1.89)] +#[inline(always)] +#[cfg(not(feature = "vpclmulqdq"))] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +//#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(feature = "vpclmulqdq")] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + if is_x86_feature_detected!("vpclmulqdq") + && is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512vl") + { + unsafe { + return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); + } + } + + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + unsafe { + return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); + } + } + + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] +unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 0) +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] +unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 0) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { + _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as 
i32), 0) +} + +// x^n mod P, in log(n) time +#[target_feature(enable = "sse4.2,pclmulqdq")] +unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // Use hardware CRC32C instruction + acc = _mm_crc32_u32(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + let x = _mm_cvtsi32_si128(acc as i32); + let y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)) as u64; + acc = _mm_crc32_u64(0, y << low) as u32; + } + acc +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i { + clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64)) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 { + if idx == 0 { + _mm_cvtsi128_si64(val) as u64 + } else { + _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64 + } +} + +#[inline] +#[target_feature(enable = "sse4.2")] +unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 { + _mm_crc32_u64(crc.into(), val) as u32 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq,sse4.2")] +pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Align to 64-byte boundary (cache line) + while (buf as usize & 56) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 384 { + // First vector chunk - load three 512-bit vectors (192 bytes total) + let mut x0 = _mm512_loadu_si512(buf as *const __m512i); + let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); + let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); + + // Create the multiplication constant vector + // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes + let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); + let mut k = _mm512_broadcast_i32x4(k_128); + + // XOR the CRC into the first vector's low 32 bits + let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); + x0 = _mm512_xor_si512(crc_vec, x0); + + // First round of polynomial multiplication + let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + // 0x96 = A XOR B XOR C in ternary logic notation + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; 
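+
+        // x0..x2 now hold the folded state for the first 384 bytes; each iteration of the
+        // main loop below folds another 384 bytes into them with two rounds of 512-bit
+        // carry-less multiplies, combined via _mm512_ternarylogic_epi64 (three-way XOR).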
+ + // Main loop - process 384 bytes at a time + while len >= 384 { + // First folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(64) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(128) as *const __m512i), + 0x96, + ); + + // Second folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + } + + // Reduce x0, x1, x2 to just x0 + let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + k = _mm512_broadcast_i32x4(k_128); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + x1 = x2; + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + + // Reduce 512 bits to 128 bits + // Multiple reduction constants for different parts of the 512-bit vector + k = _mm512_setr_epi32( + 0x1c291d04u32 as i32, + 0, + 0xddc0152bu32 as i32, + 0, // Lane 0 + 0x3da6d0cbu32 as i32, + 0, + 0xba4fc28eu32 as i32, + 0, // Lane 1 + 0xf20c0dfeu32 as i32, + 0, + 0x493c7d27u32 as i32, + 0, // Lane 2 + 0, + 0, + 0, + 0, // Lane 3 (unused) + ); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + k = clmul_hi_avx512_vpclmulqdq(x0, k); + y0 = _mm512_xor_si512(y0, k); + + // Extract 128-bit lanes and combine them + let lane0 = _mm512_castsi512_si128(y0); + let lane1 = _mm512_extracti32x4_epi32(y0, 1); + let lane2 = _mm512_extracti32x4_epi32(y0, 2); + let lane3 = _mm512_extracti32x4_epi32(x0, 3); + + // Combine all lanes using ternary logic + let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); + z0 = _mm_xor_si128(z0, lane3); + + // Reduce 128 bits to 32 bits using CRC32 instructions + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; + crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; + } + + // Process remaining 8-byte chunks + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i avx512 -p crc32c -a v4s3x3 +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx2,avx512f,avx512vl,pclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align 
to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (different from ISO-HDLC) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); + x1 = _mm_ternarylogic_epi64( + x1, + y1, + _mm_loadu_si128(buf2.add(16) as *const __m128i), + 0x96, + ); + x2 = _mm_ternarylogic_epi64( + x2, + y2, + _mm_loadu_si128(buf2.add(32) as *const __m128i), + 0x96, + ); + x3 = _mm_ternarylogic_epi64( + x3, + y3, + _mm_loadu_si128(buf2.add(48) as *const __m128i), + 0x96, + ); + + // Process scalar data in parallel using hardware CRC32C + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); + x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); + + // Final scalar chunk + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; + + // Reduce 128 bits to 32 bits, and multiply by x^32 + let x0_low = _mm_extract_epi64(x0, 0) as u64; + let x0_high = _mm_extract_epi64(x0, 1) as u64; + let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); + vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i sse -p crc32c -a v4s3x3 +#[inline] +#[target_feature(enable = "sse4.2,pclmulqdq")] +pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (same as AVX-512 version) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first 
vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let mut y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR operations using separate XOR instructions (no ternary logic in SSE) + y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); + x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); + x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); + x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); + x3 = _mm_xor_si128(x3, y3); + + // Process scalar data in parallel using hardware CRC32C + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + y0 = _mm_xor_si128(y0, x1); + x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3); + x2 = _mm_xor_si128(x2, y2); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + y0 = _mm_xor_si128(y0, x2); + x0 = _mm_xor_si128(x0, y0); + + // Final scalar chunk + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + // Extract the two 64-bit parts of x0 and combine them + let x0_low = mm_extract_epi64(x0, 0); + let x0_high = mm_extract_epi64(x0, 1); + let x0_combined = mm_extract_epi64( + crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), + 0, + ); + vc ^= x0_combined; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + // Process 
remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::consts::TEST_CHECK_STRING; + use crc::{Crc, Table}; + use rand::{rng, Rng}; + + const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); + + #[test] + fn test_crc32_iscsi_check() { + assert_eq!( + crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xe3069283 + ); + } + + #[test] + fn test_crc32_iscsi_small_all_lengths() { + for len in 1..=255 { + test_crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + test_crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + test_crc32_iscsi_random(len); + } + } + + //#[rustversion::since(1.89)] + #[cfg(feature = "vpclmulqdq")] + fn test_crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + if is_x86_feature_detected!("vpclmulqdq") + && is_x86_feature_detected!("avx512vl") + && is_x86_feature_detected!("avx512f") + { + assert_eq!( + crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len()) + ^ 0xffffffff, + checksum + ); + } + + if is_x86_feature_detected!("avx512vl") + && is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("pclmulqdq") + { + assert_eq!( + crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + + assert_eq!( + crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + //#[rustversion::before(1.89)] + #[cfg(not(feature = "vpclmulqdq"))] + fn test_crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } +} diff --git a/src/crc32/mod.rs b/src/crc32/mod.rs index fba820b..518f5f2 100644 --- a/src/crc32/mod.rs +++ b/src/crc32/mod.rs @@ -1,6 +1,9 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. -//! This module provides CRC32 support. +//! This module provides CRC-32 support. 
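+//!
+//! On x86_64 and aarch64, the private `fusion` submodule combines the native CRC32
+//! instructions with carry-less multiplication ("fusion"), accelerating CRC-32/ISCSI on
+//! both architectures and CRC-32/ISO-HDLC on aarch64.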
pub mod algorithm; pub mod consts; + +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] +pub(crate) mod fusion; diff --git a/src/crc64/algorithm.rs b/src/crc64/algorithm.rs index 49cd301..5900585 100644 --- a/src/crc64/algorithm.rs +++ b/src/crc64/algorithm.rs @@ -206,9 +206,9 @@ impl EnhancedCrcWidth for crate::structs::Width64 { #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub(crate) unsafe fn process_0_to_15( data: &[u8], state: &mut CrcState, diff --git a/src/crc64/mod.rs b/src/crc64/mod.rs index 4f86b2c..44eee30 100644 --- a/src/crc64/mod.rs +++ b/src/crc64/mod.rs @@ -1,6 +1,6 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. -//! This module provides CRC64 support. +//! This module provides CRC-64 support. pub mod algorithm; pub mod consts; diff --git a/src/crc64/utils.rs b/src/crc64/utils.rs index 8aa79fa..6e5d58b 100644 --- a/src/crc64/utils.rs +++ b/src/crc64/utils.rs @@ -15,7 +15,7 @@ use std::arch::x86_64::*; #[cfg(target_arch = "aarch64")] #[allow(dead_code)] -#[target_feature(enable = "neon,aes")] +#[target_feature(enable = "aes")] pub(crate) unsafe fn print_xmm_hex(prefix: &str, xmm: uint8x16_t) { let mut temp = [0u64; 2]; vst1q_u64(temp.as_mut_ptr(), vreinterpretq_u64_u8(xmm)); @@ -24,7 +24,7 @@ pub(crate) unsafe fn print_xmm_hex(prefix: &str, xmm: uint8x16_t) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[allow(dead_code)] -#[target_feature(enable = "sse2,sse4.1")] +#[target_feature(enable = "sse4.1")] pub(crate) unsafe fn print_xmm_hex(prefix: &str, xmm: __m128i) { let mut temp = [0u64; 2]; _mm_storeu_si128(temp.as_mut_ptr() as *mut __m128i, xmm); diff --git a/src/lib.rs b/src/lib.rs index 6b66cc5..b4f7a8e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,8 +60,8 @@ //! use crc_fast::{Digest, CrcAlgorithm::Crc32IsoHdlc}; //! //! // for example/test purposes only, use your own file path -//! let binding = env::current_dir().expect("missing working dir").join("crc-check.txt"); -//! let file_on_disk = binding.to_str().unwrap(); +//! let file_path = env::current_dir().expect("missing working dir").join("crc-check.txt"); +//! let file_on_disk = file_path.to_str().unwrap(); //! //! // actual usage //! let mut digest = Digest::new(Crc32IsoHdlc); @@ -97,24 +97,22 @@ //! use crc_fast::{checksum_file, CrcAlgorithm::Crc32IsoHdlc}; //! //! // for example/test purposes only, use your own file path -//! let binding = env::current_dir().expect("missing working dir").join("crc-check.txt"); -//! let file_on_disk = binding.to_str().unwrap(); +//! let file_path = env::current_dir().expect("missing working dir").join("crc-check.txt"); +//! let file_on_disk = file_path.to_str().unwrap(); //! //! let checksum = checksum_file(Crc32IsoHdlc, file_on_disk, None); //! //! assert_eq!(checksum.unwrap(), 0xcbf43926); //! 
``` -// if VPCLMULQDQ or AVX512 is enabled, enable extra AVX512 features -#![cfg_attr( - any(feature = "vpclmulqdq", feature = "avx512"), - feature(stdarch_x86_avx512) -)] - use crate::crc32::consts::{ CRC32_AIXM, CRC32_AUTOSAR, CRC32_BASE91_D, CRC32_BZIP2, CRC32_CD_ROM_EDC, CRC32_CKSUM, CRC32_ISCSI, CRC32_ISO_HDLC, CRC32_JAMCRC, CRC32_MEF, CRC32_MPEG_2, CRC32_XFER, }; + +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] +use crate::crc32::fusion; + use crate::crc64::consts::{ CRC64_ECMA_182, CRC64_GO_ISO, CRC64_MS, CRC64_NVME, CRC64_REDIS, CRC64_WE, CRC64_XZ, }; @@ -126,7 +124,6 @@ use std::io::{Read, Write}; mod algorithm; mod arch; -mod bindings; mod combine; mod consts; mod crc32; @@ -379,8 +376,8 @@ pub fn checksum(algorithm: CrcAlgorithm, buf: &[u8]) -> u64 { /// use crc_fast::{checksum_file, CrcAlgorithm::Crc32IsoHdlc}; /// /// // for example/test purposes only, use your own file path -/// let binding = env::current_dir().expect("missing working dir").join("crc-check.txt"); -/// let file_on_disk = binding.to_str().unwrap(); +/// let file_path = env::current_dir().expect("missing working dir").join("crc-check.txt"); +/// let file_on_disk = file_path.to_str().unwrap(); /// /// let checksum = checksum_file(Crc32IsoHdlc, file_on_disk, None); /// @@ -440,32 +437,17 @@ pub fn checksum_combine( /// Returns the target used to calculate the CRC checksum for the specified algorithm. /// +/// These strings are informational only, not stable, and shouldn't be relied on to match across +/// versions. +/// /// # Examples ///```rust /// use crc_fast::{get_calculator_target, CrcAlgorithm::Crc32IsoHdlc}; /// /// let target = get_calculator_target(Crc32IsoHdlc); /// ``` -pub fn get_calculator_target(algorithm: CrcAlgorithm) -> String { - match algorithm { - CrcAlgorithm::Crc32IsoHdlc => { - #[cfg(optimized_crc32_iso_hdlc)] - unsafe { - bindings::get_iso_hdlc_target() - } - #[cfg(not(optimized_crc32_iso_hdlc))] - arch::get_target() - } - CrcAlgorithm::Crc32Iscsi => { - #[cfg(optimized_crc32_iscsi)] - unsafe { - bindings::get_iscsi_target() - } - #[cfg(not(optimized_crc32_iscsi))] - arch::get_target() - } - _ => arch::get_target(), - } +pub fn get_calculator_target(_algorithm: CrcAlgorithm) -> String { + arch::get_target() } /// Returns the calculator function and parameters for the specified CRC algorithm. @@ -496,53 +478,34 @@ fn get_calculator_params(algorithm: CrcAlgorithm) -> (CalculatorFn, CrcParams) { /// Calculates the CRC-32/ISCSI ("crc32c" in many, but not all, implementations) checksum. /// -/// By default, uses an external optimized C implementation, but can be switched to an internal -/// SIMD-only implementation by using the `internal_simd_only` feature flag. -/// -/// The external optimized implementation is also tunable via feature flags. +/// Because both aarch64 and x86 have native hardware support for CRC-32/ISCSI, we can use +/// fusion techniques to accelerate the calculation beyond what SIMD can do alone. 
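+///
+/// On targets other than aarch64 and x86_64 this falls back to the generic SIMD
+/// `Calculator` path; the 64-bit state is truncated to 32 bits for the fusion kernel
+/// and widened back on return.
+///
+/// The same calculation is exposed through the public `checksum` API, e.g.:
+///
+/// ```rust
+/// use crc_fast::{checksum, CrcAlgorithm::Crc32Iscsi};
+///
+/// // CRC-32/ISCSI check value for "123456789"
+/// assert_eq!(checksum(Crc32Iscsi, b"123456789"), 0xe3069283);
+/// ```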
#[inline(always)] -fn crc32_iscsi_calculator(state: u64, data: &[u8], params: CrcParams) -> u64 { - #[cfg(optimized_crc32_iscsi)] - { - bindings::crc32_iscsi(state, data, params) - } - - #[cfg(not(optimized_crc32_iscsi))] - { - Calculator::calculate(state, data, params) - } +fn crc32_iscsi_calculator(state: u64, data: &[u8], _params: CrcParams) -> u64 { + // both aarch64 and x86 have native CRC-32/ISCSI support, so we can use fusion + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] + return fusion::crc32_iscsi(state as u32, data) as u64; + + #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))] + // fallback to traditional calculation if not aarch64 or x86_64 + Calculator::calculate(state, data, _params) } /// Calculates the CRC-32/ISO-HDLC ("crc32" in many, but not all, implementations) checksum. /// -/// By default, uses an external optimized C implementation, but can be switched to an internal -/// SIMD-only implementation by using the `internal_simd_only` feature flag. -/// -/// The external optimized implementation is also tunable via feature flags.#[inline(always)] -fn crc32_iso_hdlc_calculator(state: u64, data: &[u8], params: CrcParams) -> u64 { - #[cfg(optimized_crc32_iso_hdlc)] - { - // Call the FFI function for CRC-32/ISO-HDLC for large (>1KiB) data payloads - #[cfg(target_arch = "x86_64")] - { - if data.len() > 1024 && std::arch::is_x86_feature_detected!("vpclmulqdq") { - return bindings::crc32_iso_hdlc(state, data, params); - } - - // our internal SIMD implementation for small (<1KiB) data payloads is faster, - // only for CRC-32/ISO_HDLC on non-VPCLMULQDQ platforms - Calculator::calculate(state, data, params) - } - - #[cfg(not(target_arch = "x86_64"))] - // Call the FFI function for CRC-32/ISO-HDLC for all payloads non-x86_64 - return bindings::crc32_iso_hdlc(state, data, params); - } - - #[cfg(not(optimized_crc32_iso_hdlc))] - { - Calculator::calculate(state, data, params) - } +/// Because aarch64 has native hardware support for CRC-32/ISO-HDLC, we can use fusion techniques +/// to accelerate the calculation beyond what SIMD can do alone. x86 does not have native support, +/// so we use the traditional calculation. 
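+///
+/// On targets other than aarch64 the generic SIMD `Calculator` path is used; on aarch64
+/// the 64-bit state is truncated to 32 bits for the fusion kernel and widened back on
+/// return.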
+#[inline(always)] +fn crc32_iso_hdlc_calculator(state: u64, data: &[u8], _params: CrcParams) -> u64 { + // aarch64 CPUs have native CRC-32/ISO-HDLC support, so we can use the fusion implementation + #[cfg(target_arch = "aarch64")] + return fusion::crc32_iso_hdlc(state as u32, data) as u64; + + // x86 CPUs don't have native CRC-32/ISO-HDLC support, so there's no fusion to be had, use + // traditional calculation + #[cfg(not(target_arch = "aarch64"))] + Calculator::calculate(state, data, _params) } #[cfg(test)] @@ -551,6 +514,7 @@ mod lib { use super::*; use crate::test::consts::{TEST_ALL_CONFIGS, TEST_CHECK_STRING}; + use crate::test::enums::AnyCrcTestConfig; use cbindgen::Language::{Cxx, C}; use cbindgen::Style::Both; use rand::{rng, Rng}; @@ -591,68 +555,52 @@ mod lib { #[test] fn test_small_all_lengths() { - let mut rng = rng(); - - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 1 to 255 for len in 1..=255 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - let result = checksum(config.get_algorithm(), &data); - - assert_eq!(result, expected); + test_length(len, config); } } } #[test] fn test_medium_lengths() { - let mut rng = rng(); - - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 256 to 1024, which should fold and include handling remainders for len in 256..=1024 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - let result = checksum(config.get_algorithm(), &data); - - assert_eq!(result, expected); + test_length(len, config); } } } #[test] fn test_large_lengths() { - let mut rng = rng(); - - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test 1 MiB just before, at, and just after the folding boundaries for len in 1048575..1048577 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); + test_length(len, config); + } + } + } - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); + fn test_length(length: usize, config: &AnyCrcTestConfig) { + let mut data = vec![0u8; length]; + rng().fill(&mut data[..]); - let result = checksum(config.get_algorithm(), &data); + // Calculate expected CRC using the reference implementation + let expected = config.checksum_with_reference(&data); - assert_eq!(result, expected); - } - } + let result = checksum(config.get_algorithm(), &data); + + assert_eq!( + result, + expected, + "Failed for algorithm: {:?}, length: {}, expected: {:#x}, got: {:#x}", + config.get_algorithm(), + length, + expected, + result + ); } #[test] @@ -803,126 +751,53 @@ mod lib { return Ok(()); } - #[cfg(not(target_os = "windows"))] - { - const HEADER: &str = "libcrc_fast.h"; - - let crate_dir = - std::env::var("CARGO_MANIFEST_DIR").map_err(|error| error.to_string())?; - - let mut expected = Vec::new(); - cbindgen::Builder::new() - .with_crate(crate_dir) - .with_include_guard("CRC_FAST_H") - .with_header("/* crc_fast library C/C++ API - Copyright 2025 Don MacAskill */\n/* This header is auto-generated. Do not edit directly. 
*/\n") - // exclude internal implementation functions - .exclude_item("crc32_iscsi_impl") - .exclude_item("crc32_iso_hdlc_impl") - .exclude_item("get_iscsi_target") - .exclude_item("get_iso_hdlc_target") - .exclude_item("ISO_HDLC_TARGET") - .exclude_item("ISCSI_TARGET") - .exclude_item("CrcParams") - .rename_item("Digest", "CrcFastDigest") - .with_style(Both) - // generate C header - .with_language(C) - // with C++ compatibility - .with_cpp_compat(true) - .generate() - .map_err(|error| error.to_string())? - .write(&mut expected); - - // Convert the expected bytes to string for pattern replacement, since cbindgen - // generates an annoying amount of empty contiguous newlines - let header_content = String::from_utf8(expected).map_err(|error| error.to_string())?; - - // Replace excessive newlines (3 or more consecutive newlines) with 2 newlines - let regex = regex::Regex::new(r"\n{3,}").map_err(|error| error.to_string())?; - let cleaned_content = regex.replace_all(&header_content, "\n\n").to_string(); - - // Convert back to bytes - expected = cleaned_content.into_bytes(); - - let actual = read(HEADER).map_err(|error| error.to_string())?; - - if expected != actual { - write(HEADER, expected).map_err(|error| error.to_string())?; - return Err(format!( - "{HEADER} is not up-to-date, commit the generated file and try again" - )); - } - - Ok(()) - } - } - - /// Tests whether the CRC-32/ISO-HDLC bindings are up-to-date - #[test] - #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] - fn test_crc32_iso_hdlc_bindings() -> Result<(), String> { - build_bindgen("crc32_iso_hdlc", "src/bindings/crc32_iso_hdlc.rs") - } - - /// Tests whether the CRC-32/ISCSI bindings are up-to-date - #[test] - #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] - fn test_crc32_iscsi_bindings() -> Result<(), String> { - build_bindgen("crc32_iscsi", "src/bindings/crc32_iscsi.rs") - } - - fn build_bindgen(name: &str, bindings_path: &str) -> Result<(), String> { - // Getting the Rust cross compile toolchain working on x86 such that it builds correctly - // _and_ can validate the header output via bindgen is non-obvious. Since I doubt many - // people are actually doing development work on x86, as opposed to x86_64 or aarch64, - // I'm just going to skip the bindgen tests on x86. The important tests (do these - // CRC-32 variants actually work?) is covered by the other tests, this is just a - // development artifact test. - - #[cfg(target_arch = "x86")] - { - eprintln!("Skipping test on x86 for {} to {}", name, bindings_path); - - return Ok(()); - } - - // Skip this test on Windows, since CRLF vs LF is a PITA - #[cfg(target_os = "windows")] - { - // Skip this test on Windows, since CRLF vs LF is a PITA - eprintln!("Skipping test on Windows"); - - return Ok(()); + const HEADER: &str = "libcrc_fast.h"; + + let crate_dir = std::env::var("CARGO_MANIFEST_DIR").map_err(|error| error.to_string())?; + + let mut expected = Vec::new(); + cbindgen::Builder::new() + .with_crate(crate_dir) + .with_include_guard("CRC_FAST_H") + .with_header("/* crc_fast library C/C++ API - Copyright 2025 Don MacAskill */\n/* This header is auto-generated. Do not edit directly. 
*/\n") + // exclude internal implementation functions + .exclude_item("crc32_iscsi_impl") + .exclude_item("crc32_iso_hdlc_impl") + .exclude_item("get_iscsi_target") + .exclude_item("get_iso_hdlc_target") + .exclude_item("ISO_HDLC_TARGET") + .exclude_item("ISCSI_TARGET") + .exclude_item("CrcParams") + .rename_item("Digest", "CrcFastDigest") + .with_style(Both) + // generate C header + .with_language(C) + // with C++ compatibility + .with_cpp_compat(true) + .generate() + .map_err(|error| error.to_string())? + .write(&mut expected); + + // Convert the expected bytes to string for pattern replacement, since cbindgen + // generates an annoying amount of empty contiguous newlines + let header_content = String::from_utf8(expected).map_err(|error| error.to_string())?; + + // Replace excessive newlines (3 or more consecutive newlines) with 2 newlines + let regex = regex::Regex::new(r"\n{3,}").map_err(|error| error.to_string())?; + let cleaned_content = regex.replace_all(&header_content, "\n\n").to_string(); + + // Convert back to bytes + expected = cleaned_content.into_bytes(); + + let actual = read(HEADER).map_err(|error| error.to_string())?; + + if expected != actual { + write(HEADER, expected).map_err(|error| error.to_string())?; + return Err(format!( + "{HEADER} is not up-to-date, commit the generated file and try again" + )); } - #[cfg(not(any(target_arch = "x86", target_os = "windows")))] - { - let bindings = bindgen::Builder::default() - .header(format!("include/{name}.h")) - .allowlist_function("crc32_iscsi_impl") - .allowlist_function("get_iscsi_target") - .allowlist_var("ISCSI_TARGET") - .allowlist_function("crc32_iso_hdlc_impl") - .allowlist_function("get_iso_hdlc_target") - .allowlist_var("ISO_HDLC_TARGET") - .generate() - .expect("Unable to generate bindings"); - - let expected = bindings.to_string().into_bytes(); - - let actual = read(bindings_path).map_err(|error| error.to_string())?; - - if expected != actual { - bindings - .write_to_file(bindings_path) - .expect("Couldn't write bindings to SRC!"); - - return Err(format!( - "{bindings_path} is not up-to-date, commit the generated file and try again" - )); - } - - Ok(()) - } + Ok(()) } } diff --git a/src/test/mod.rs b/src/test/mod.rs index 9ac64c5..1b74ec4 100644 --- a/src/test/mod.rs +++ b/src/test/mod.rs @@ -5,8 +5,8 @@ #![cfg(test)] #![allow(dead_code)] -pub mod consts; -mod enums; +pub(crate) mod consts; +pub(crate) mod enums; mod structs; /// Creates a new aligned data vector from the input slice for testing.