diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fa47cb7..3f6c8ee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,7 +27,7 @@ jobs: - name: Architecture check run: cargo run arch-check - name: Architecture check (Optimized) - run: cargo run --features=optimize_crc32_auto arch-check + run: cargo run arch-check - if: ${{ matrix.rust-toolchain != 'nightly' }} name: Format run: cargo fmt -- --check @@ -36,8 +36,6 @@ jobs: run: cargo clippy - name: Test run: cargo test - - name: Test (Optimized) - run: cargo test --features=optimize_crc32_auto test-x86: name: Test accelerated (x86) @@ -61,8 +59,6 @@ jobs: run: cross check --target ${{ matrix.target }} - name: Test run: cross test --target ${{ matrix.target }} - - name: Test (Optimized) - run: cross test --features=optimize_crc32_auto --target ${{ matrix.target }} test-software: name: Test software fallback @@ -85,6 +81,4 @@ jobs: - name: Check run: cross check --target ${{ matrix.target }} - name: Test - run: cross test --target ${{ matrix.target }} - - name: Test (Optimized) - run: cross test --features=optimize_crc32_auto --target ${{ matrix.target }} \ No newline at end of file + run: cross test --target ${{ matrix.target }} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 4c7e1b9..ce0fa22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,7 +82,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.13.0", + "itertools", "log", "prettyplease", "proc-macro2", @@ -139,17 +139,6 @@ dependencies = [ "toml", ] -[[package]] -name = "cc" -version = "1.2.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1" -dependencies = [ - "jobserver", - "libc", - "shlex", -] - [[package]] name = "cexpr" version = "0.6.0" @@ -257,7 +246,6 @@ version = "1.2.2" dependencies = [ "bindgen", "cbindgen", - "cc", "crc", "criterion", "digest", @@ -278,7 +266,7 @@ dependencies = [ "clap", "criterion-plot", "is-terminal", - "itertools 0.10.5", + "itertools", "num-traits", "once_cell", "oorandom", @@ -299,7 +287,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", - "itertools 0.10.5", + "itertools", ] [[package]] @@ -473,31 +461,12 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" -[[package]] -name = "jobserver" -version = "0.1.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" -dependencies = [ - "getrandom", - "libc", -] - [[package]] name = "js-sys" version = "0.3.77" @@ -516,9 +485,9 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libloading" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" +checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ 
"cfg-if", "windows-targets", @@ -618,9 +587,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.32" +version = "0.2.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" +checksum = "9dee91521343f4c5c6a63edd65e54f31f5c92fe8978c40a4282f8372194c6a7d" dependencies = [ "proc-macro2", "syn", diff --git a/Cargo.toml b/Cargo.toml index 15a261f..fba702e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,9 +30,6 @@ criterion = "0.5" cbindgen = "0.28" bindgen = "0.70" # 0.70 is the last version that supports Rust 1.81 due to 'unsafe extern' blocks -[build-dependencies] -cc = { version = "1.2", features = ["parallel"] } - # lto=true has a big improvement in performance [profile.release] lto = true @@ -47,36 +44,15 @@ harness = false [features] alloc = [] -# enable VPCLMULQDQ support in Rust for x86_64 using nightly toolchain builds +# enable experimental VPCLMULQDQ support, which landed in Rust 1.89.0-nightly, will deprecate after 1.89.0 is stable vpclmulqdq = [] -# enable AVX512 support in Rust for x86_64 using nightly toolchain builds -avx512 = [] - -# enable using fast-crc32 optimized C implementations for CRC-32/ISCSI and CRC-32/ISO-HDLC, automatically detected -optimize_crc32_auto = [] - -# the following features enable forcing custom optimized build features (rather than "auto" which attemps to pick the -# best) for CRC-32/ISCSI and CRC-32/ISO-HDLC calculations, since architecture support and performance varies - -# aarch64 NEON options -optimize_crc32_neon_eor3_v9s3x2e_s3 = [] -optimize_crc32_neon_v12e_v1 = [] -optimize_crc32_neon_v3s4x2e_v2 = [] - -# blends eor3_v9s3x2e_s3 for "large" (>1KiB) payloads, and v12e_v1 for "small" ones, which tends to yield the best -# results on modern aarch64 such as Graviton and Apple Silicon -optimize_crc32_neon_blended = [] - -# x86 SSE+ options -# this will blend automagically for CRC-32/ISO-HDLC which tends to have poor hardware support, but typically great -# support for CRC-32/ISCSI -optimize_crc32_avx512_vpclmulqdq_v3x2 = [] - -# non-blended alternatives -optimize_crc32_avx512_v4s3x3 = [] -optimize_crc32_sse_v4s3x3 = [] - -[lints.rust] -# build-time feature enablement -unexpected_cfgs = { level = "warn", check-cfg = ['cfg(optimized_crc32_iscsi)','cfg(optimized_crc32_iso_hdlc)' ] } +# the features below aren't in use, are deprecated, and will be removed in the next MAJOR version +optimize_crc32_auto = [] # deprecated +optimize_crc32_neon_eor3_v9s3x2e_s3 = [] # deprecated +optimize_crc32_neon_v12e_v1 = [] # deprecated +optimize_crc32_neon_v3s4x2e_v2 = [] # deprecated +optimize_crc32_neon_blended = [] # deprecated +optimize_crc32_avx512_vpclmulqdq_v3x2 = [] # deprecated +optimize_crc32_avx512_v4s3x3 = [] # deprecated +optimize_crc32_sse_v4s3x3 = [] # deprecated \ No newline at end of file diff --git a/README.md b/README.md index dee097f..6322a29 100644 --- a/README.md +++ b/README.md @@ -5,19 +5,19 @@ [![Latest Version](https://img.shields.io/crates/v/crc-fast.svg)](https://crates.io/crates/crc-fast) [![Documentation](https://img.shields.io/badge/api-rustdoc-blue.svg)](https://docs.rs/crc-fast) -Fast, hardware-accelerated CRC calculation for +Fast, hardware-accelerated CRC calculation for [all known CRC-32 and CRC-64 variants](https://reveng.sourceforge.io/crc-catalogue/all.htm) using SIMD intrinsics, -which can exceed _100GiB/s_ for `CRC-32`, and _50GiB/s_ for `CRC-64`, on modern systems. 
+which can exceed [100GiB/s](#performance) on modern systems. -Supports acceleration on `aarch64`, `x86_64`, and `x86` architectures, plus has a safe non-accelerated software -fallback for other architectures. +Supports acceleration on `aarch64`, `x86_64`, and `x86` architectures, plus has a safe non-accelerated table-based +software fallback for others. -The [crc crate](https://crates.io/crates/crc) is ~0.5GiB/s by default, so this is -[up to >200X faster](#tldr-just-tell-me-how-to-turn-it-up-to-11-), and even the most conservative baseline settings +The [crc crate](https://crates.io/crates/crc) is ~0.5GiB/s by default, so this is +[up to >220X faster](#tldr-just-tell-me-how-to-turn-it-up-to-11-), and even the most conservative baseline settings are >27X. -This is unique, not just because of the performance, but also because I couldn't find a single generic SIMD-accelerated -implementation (in any language) which worked for _all_ known variants, using the +This is unique, not just because of the performance, but also because I couldn't find a single generic SIMD-accelerated +implementation (in any language) which worked for _all_ known variants, using the [Rocksoft model](http://www.ross.net/crc/download/crc_v3.txt), especially the "non-reflected" variants. So I wrote one. @@ -26,6 +26,12 @@ So I wrote one. Supplies a [C/C++ compatible shared library](#cc-compatible-shared-library) for use with other non-`Rust` languages. +## Implementations + +* [AWS SDK for Rust](https://awslabs.github.io/aws-sdk-rust/) via + the [aws-smithy-checksums](https://crates.io/crates/aws-smithy-checksums) crate. +* [crc-fast-php-ext](https://github.com/awesomized/crc-fast-php-ext) `PHP` extension using this library. + ## Changes See [CHANGELOG](CHANGELOG.md). @@ -33,10 +39,11 @@ See [CHANGELOG](CHANGELOG.md). ## Build & Install `cargo build` will obviously build the library, including -the [C-compatible shared library](#c-compatible-shared-library). There are fine-tuning [feature flags](Cargo.toml) +the [C-compatible shared library](#c-compatible-shared-library). There are fine-tuning [feature flags](Cargo.toml) available, should they be necessary for your deployment and [acceleration](#acceleration-targets) targets. -A _very_ basic [Makefile](Makefile) is supplied which supports `make install` to install the shared library and header file to +A _very_ basic [Makefile](Makefile) is supplied which supports `make install` to install the shared library and header +file to the local system. Specifying the `DESTDIR` environment variable will allow you to customize the install location. ``` @@ -47,16 +54,15 @@ You'll need to adjust if you want to optimize with [feature flags](Cargo.toml). ## Usage -Add `crc-fast = { version = "1.1", features = ["optimize_crc32_auto"] }` to your `Cargo.toml` dependencies, which will -enable every available optimization for the `stable` toolchain. Adjust as necessary for your desired -[acceleration targets](#acceleration-targets). +Add `crc-fast = version = "1.3"` to your `Cargo.toml` dependencies, which will enable every available optimization for +the `stable` toolchain. Adjust as necessary for your desired [acceleration targets](#acceleration-targets). ### Digest Implements the [digest::DynDigest](https://docs.rs/digest/latest/digest/trait.DynDigest.html) trait for easier integration with existing Rust code. 
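For example, a `Digest` can be handed to generic code that only knows about the `DynDigest` trait object. This is a minimal sketch, not one of the crate's documented examples; the import paths and the `Digest::new(Crc32IsoHdlc)` constructor are assumed from the examples below, and the trait's `finalize_reset` returns the checksum as raw bytes rather than the integer returned by the crate's own `finalize`.

```rust
use crc_fast::{CrcAlgorithm::Crc32IsoHdlc, Digest};
use digest::DynDigest;

// Generic code that only knows about the trait object, not crc-fast itself.
fn checksum_parts(hasher: &mut dyn DynDigest, parts: &[&[u8]]) -> Box<[u8]> {
    for &part in parts {
        hasher.update(part);
    }
    hasher.finalize_reset()
}

let mut digest = Digest::new(Crc32IsoHdlc);

// feed the standard check input in two pieces
let bytes = checksum_parts(&mut digest, &[&b"1234"[..], &b"56789"[..]]);
```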
-Creates a `Digest` which can be updated over time, for stream processing, intermittent workloads, etc, enabling +Creates a `Digest` which can be updated over time, for stream processing, intermittent workloads, etc, enabling finalizing the checksum once processing is complete. ```rust @@ -87,7 +93,7 @@ let file_on_disk = binding.to_str().unwrap(); // actual usage let mut digest = Digest::new(Crc32IsoHdlc); let mut file = File::open(file_on_disk).unwrap(); -std::io::copy(&mut file, &mut digest).unwrap(); +std::io::copy( & mut file, & mut digest).unwrap(); let checksum = digest.finalize(); assert_eq!(checksum, 0xcbf43926); @@ -138,17 +144,17 @@ assert_eq!(checksum.unwrap(), 0xcbf43926); ## C/C++ compatible shared library -`cargo build` will produce a shared library target (`.so` on Linux, `.dll` on Windows, `.dylib` on macOS, etc) and an -auto-generated [libcrc_fast.h](libcrc_fast.h) header file for use in non-Rust projects, such as through +`cargo build` will produce a shared library target (`.so` on Linux, `.dll` on Windows, `.dylib` on macOS, etc) and an +auto-generated [libcrc_fast.h](libcrc_fast.h) header file for use in non-Rust projects, such as through [FFI](https://en.wikipedia.org/wiki/Foreign_function_interface). There is a [crc-fast PHP extension](https://github.com/awesomized/crc-fast-php-ext) using it, for example. ## Background -This implementation is based on Intel's +This implementation is based on Intel's [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) -white paper, though it folds 8-at-a-time, like other modern implementations, rather than the 4-at-a-time as in Intel's +white paper, though it folds 8-at-a-time, like other modern implementations, rather than the 4-at-a-time as in Intel's paper. This library works on `aarch64`, `x86_64`, and `x86` architectures, and is hardware-accelerated and optimized for each @@ -157,7 +163,7 @@ architecture. Inspired by [`crc32fast`](https://crates.io/crates/crc32fast), [`crc64fast`](https://crates.io/crates/crc64fast), and [`crc64fast-nvme`](https://crates.io/crates/crc64fast-nvme), each of which only accelerates a single, different CRC -variant, and all of them were "reflected" variants. +variant, and all of them were "reflected" variants. In contrast, this library accelerates _every known variant_ (and should accelerate any future variants without changes), including all the "non-reflected" variants. @@ -169,185 +175,212 @@ stand out as being the most important and widely used (all of which are "reflect ### [CRC-32/ISCSI](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-32-iscsi) -Many, but not all, implementations simply call this `crc32c` and it's probably the 2nd most popular and widely used, +Many, but not all, implementations simply call this `crc32c` and it's probably the 2nd most popular and widely used, after `CRC-32/ISO-HDLC`. It's used in `iSCSI`, `ext4`, `btrfs`, etc. +Both `x86_64` and `aarch64` have native hardware support for this CRC variant, so we can use +[fusion](https://www.corsix.org/content/fast-crc32c-4k) in many cases to accelerate it further by fusing SIMD CLMUL +instructions with the native CRC instructions. 
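As a rough illustration of the scalar half of that fusion (this is *not* how this library is implemented internally, just a sketch assuming an `x86_64` CPU with `SSE4.2`), the dedicated `crc32` instructions process the Castagnoli polynomial 8 bytes at a time; the fusion approach interleaves loops like this with `PCLMULQDQ` folding of separate blocks:

```rust
#[cfg(target_arch = "x86_64")]
fn crc32c_scalar(mut crc: u32, data: &[u8]) -> u32 {
    use core::arch::x86_64::{_mm_crc32_u64, _mm_crc32_u8};

    // runtime guard; a real implementation would fall back to a software path
    assert!(is_x86_feature_detected!("sse4.2"));

    crc = !crc; // CRC-32/ISCSI is reflected, with init and xorout of 0xFFFFFFFF
    let mut chunks = data.chunks_exact(8);
    unsafe {
        for chunk in &mut chunks {
            // one hardware CRC instruction per 8-byte quadword
            let word = u64::from_le_bytes(chunk.try_into().unwrap());
            crc = _mm_crc32_u64(crc as u64, word) as u32;
        }
        for &byte in chunks.remainder() {
            crc = _mm_crc32_u8(crc, byte);
        }
    }
    !crc
}
```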
+ ### [CRC-32/ISO-HDLC](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-32-iso-hdlc) Many, but not all, implementations simply call this `crc32` and it may be the most popular and widely used. It's used in `Ethernet`, `PKZIP`, `xz`, etc. +Only `aarch64` has native hardware support for this CRC variant, so we can use +[fusion](https://www.corsix.org/content/fast-crc32c-4k) on that platform, but not `x86_64`. + ### [CRC-64/NVME](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-nvme) -`CRC-64/NVME` comes from the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) +`CRC-64/NVME` comes from +the [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) (Revision 1.0d, December 2023), is [AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) -(as `CRC64-NVME`), and has also been implemented in the +(as `CRC64-NVME`), and has also been implemented in the [Linux kernel](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c#L66-L73) -(where it's called `CRC-64/Rocksoft`). +(where it's been called `CRC-64/Rocksoft` in the past). Note that the `Check` value in the `NVMe` spec uses incorrect endianness (see `Section 5.2.1.3.4, Figure 120, page 83`) but all known public & private implementations agree on the correct value, which this library produces. # Acceleration targets -This library has baseline support for accelerating all known `CRC-32` and `CRC-64` variants on `aarch64`, `x86_64`, and -`x86` internally in pure `Rust`. It's extremely fast (up to dozens of GiB/s). This is the default if no feature flags are -specified. - -With feature flags, it can be even faster. 😎 +This library has baseline support for accelerating all known `CRC-32` and `CRC-64` variants on `aarch64`, `x86_64`, and +`x86` internally in pure `Rust`. It's extremely fast (up to dozens of GiB/s) by default if no feature flags are +used. ### tl;dr: Just tell me how to turn it up to 11! 🤘 -For modern `x86_64` systems (requires `nightly` toolchain) which further accelerates _all_ variants, especially -`CRC-32/ISCSI` and `CRC-32/ISO-HDLC`: -``` -rustup toolchain install nightly -cargo +nightly build --release --features=optimize_crc32_auto,vpclmulqdq -``` +For `aarch64` and older `x86_64` systems, the release build will use the best available acceleration: -For `aarch64`, and older `x86_64` / `x86`, systems (no `nightly` required) which further accelerates `CRC-32/ISCSI` and -`CRC-32/ISO-HDLC`: ``` -cargo build --release --features=optimize_crc32_auto +cargo build --release ``` -At [Awesome](https://awesome.co/), we use these 👆 at large scale in production at [Flickr](https://flickr.com/) and -[SmugMug](https://www.smugmug.com/). - -### CRC-32/ISO-HDLC and CRC-32/ISCSI optimization +For modern `x86_64` systems, you can enable [experimental VPCLMULQDQ support](#experimental-vpclmulqdq-support-in-rust) +for a ~2X performance boost. -By using the `optimize_crc32_auto` feature flag, the library will use -[fast-crc32](https://github.com/corsix/fast-crc32/) instead to accelerate _only_ `CRC-32/ISO-HDLC` and/or `CRC-32/ISCSI` -using a [fusion](https://www.corsix.org/content/fast-crc32c-4k) of hardware `crc32(c)` support and `PCLMULQDQ`. 
- -`fast-crc32` does not accelerate any other `CRC-32` variants, or any `CRC-64` variants, since none of the others have -native hardware-acceleration support in any CPUs which would enable `fusion`. +At [Awesome](https://awesome.co/), we use these 👆 at large scale in production at [Flickr](https://flickr.com/) and +[SmugMug](https://www.smugmug.com/). -`fast-crc32` will use `VPCLMULQDQ` if available, without requiring the need for `nightly` builds (since it's an external -C implementation). +### Checking your platform capabilities There's an [arch-check](src/bin/arch-check.rs) binary which will explain the selected target architecture. ``` // test it works on your system (patches welcome!) -cargo test --features=optimize_crc32_auto +cargo test // examine the chosen acceleration targets -cargo run --features=optimize_crc32_auto arch-check +cargo run arch-check // build for release -cargo build --features=optimize_crc32_auto --release +cargo build --release ``` -There are additional [feature flags](Cargo.toml) to force certain implementations for fine-tuning, benchmarking, etc. - ### Experimental VPCLMULQDQ support in Rust -This library also supports [VPCLMULQDQ](https://en.wikichip.org/wiki/x86/vpclmulqdq) for accelerating all -`CRC-32` and `CRC-64` variants on modern `x86_64` platforms which support it when using `nightly` builds and the -`vpclmulqdq` feature flag. +This library also supports [VPCLMULQDQ](https://en.wikichip.org/wiki/x86/vpclmulqdq) for accelerating all `CRC-32` and +`CRC-64` variants on modern `x86_64` +platforms which support it when using `nightly` builds and the `vpclmulqdq` feature flag. -Typical performance boosts are ~2X, and they apply to CPUs beginning with Intel -[Ice Lake](https://en.wikipedia.org/wiki/Ice_Lake_%28microprocessor%29) (Sep 2019) and AMD -[Zen4](https://en.wikipedia.org/wiki/Zen_4) (Sep 2022). +Typical performance boosts are ~2X, and they apply to CPUs beginning with Intel +[Ice Lake](https://en.wikipedia.org/wiki/Ice_Lake_%28microprocessor%29) (Sep 2019) and +AMD [Zen4](https://en.wikipedia.org/wiki/Zen_4) (Sep 2022). ``` rustup toolchain install nightly cargo +nightly build --release --features=vpclmulqdq ``` -There's a [tracking issue](https://github.com/rust-lang/rust/issues/111137) for when these features might land on -`stable`, which looks like [very soon](https://github.com/rust-lang/rust/issues/111137#issuecomment-2787196977), at -which point this library will adopt it as a default. +`AVX512` support with `VPCLMULQDQ` is stabilized on [1.89.0](https://releases.rs/docs/1.89.0/), so once that becomes +stable in August 2025, this library will be updated to use it by default without needing the `nightly` toolchain. ## Performance -Modern systems can exceed 100GiB/s for calculating `CRC-32/ISCSI` and `CRC-32/ISO-HDLC`, and 50GiB/s for calculating -`CRC-64/NVME`. +Modern systems can exceed 100 GiB/s for calculating `CRC-32/ISCSI`, `CRC-32/ISO-HDLC`, +`CRC-64/NVME`, and all other reflected variants. (Forward variants are slower, due to the extra shuffle-masking, but +are still extremely fast in this library). -This is a summary of the best [targets](#acceleration-targets) for the most important and popular CRC checksums. More -extensive benchmark results, with other targets and variants, can be found in the [benches](benches/README.md) folder. +This is a summary of the best [targets](#acceleration-targets) for the most important and popular CRC checksums. 
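To reproduce numbers like these on your own hardware, the [criterion](https://crates.io/crates/criterion) benchmarks in [benches](benches/benchmark.rs) can be run directly. The exact commands below are an assumption based on the standard `cargo bench` workflow rather than a documented interface:

```
// baseline acceleration (stable toolchain)
cargo bench

// with experimental VPCLMULQDQ acceleration on modern x86_64
cargo +nightly bench --features=vpclmulqdq
```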
-### CRC-32/ISCSI +### CRC-32/ISCSI (reflected) AKA `crc32c` in many, but not all, implementations. -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:-----------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-48xl | avx512_vpclmulqdq_v3x2 | ~38.0 | ~111.7 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512_vpclmulqdq_v3x2 | ~21.1 | ~54.6 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_blended | ~18.5 | ~31.6 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_v12e_v1 | ~54.8 | ~99.6 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon_blended | ~60.8 | ~96.3 | -| aarch64 | Apple | M2 Ultra | Mac Studio (24 core) | neon_blended | ~50.3 | ~87.6 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~49 | ~111 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~18 | ~52 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~23 | ~54 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~20 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~49 | ~99 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 | -### CRC-32/ISO-HDLC +### CRC-32/ISO-HDLC (reflected) AKA `crc32` in many, but not all, implementations. 
-| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:--------------------------|:-------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-48xl | avx2_blended | ~16.6 | ~110.4 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx2_blended | ~17.2 | ~53.8 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_blended | ~18.5 | ~31.5 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_v12e_v1 | ~56.5 | ~98.8 | -| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon_v12e_v1 | ~59.2 | ~105.3 | -| aarch64 | Apple | M2 Ultra | Mac Studio (24 core) | neon_blended | ~50.1 | ~87.0 | - -### CRC-64/NVME - -| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | -|:--------|:------|:----------------|:---------------------|:--------------------|-------------:|-------------:| -| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512_vpclmulqdq | ~24.9 | ~109.7 | -| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512_vpclmulqdq | ~24.4 | ~54.6 | -| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon_pclmulqdq_eor3 | ~18.7 | ~36.8 | -| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon_pclmulqdq | ~9.8 | ~15.9 | -| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon_pclmulqdq_eor3 | ~49.5 | ~71.9 | +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | avx512-vpclmulqdq* | ~24 | ~110 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-248xl | sse-pclmulqdq | ~21 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~24 | ~55 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~12 | ~14 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~19 | ~39 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~17 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~48 | ~98 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~56 | ~94 | + +### CRC-64/NVME (reflected) + +[AWS S3's recommended checksum option](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) + +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~25 | ~110 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~21 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~25 | ~55 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~14 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~20 | ~37 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~10 | ~16 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~50 | ~72 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~52 | ~72 | + +### CRC-32/BZIP2 (forward) + +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~23 
| ~56 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~21 | ~43 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~16 | ~32 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~41 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~47 | ~64 | + +### CRC-64/ECMA-182 (forward) + +| Arch | Brand | CPU | System | Target | 1KiB (GiB/s) | 1MiB (GiB/s) | +|:--------|:------|:----------------|:--------------------------|:--------------------|-------------:|-------------:| +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | avx512-vpclmulqdq* | ~24 | ~56 | +| x86_64 | Intel | Sapphire Rapids | EC2 c7i.metal-24xl | sse-pclmulqdq | ~19 | ~28 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | avx512-vpclmulqdq* | ~21 | ~43 | +| x86_64 | AMD | Genoa | EC2 c7a.metal-48xl | sse-pclmulqdq | ~11 | ~13 | +| aarch64 | AWS | Graviton4 | EC2 c8g.metal-48xl | neon-eor3-pclmulqdq | ~18 | ~31 | +| aarch64 | AWS | Graviton2 | EC2 c6g.metal | neon-pclmulqdq | ~9 | ~14 | +| aarch64 | Apple | M3 Ultra | Mac Studio (32 core) | neon-eor3-pclmulqdq | ~40 | ~59 | +| aarch64 | Apple | M4 Max | MacBook Pro 16" (16 core) | neon-eor3-pclmulqdq | ~46 | ~61 | + +\* = [Experimental VPCLMULQDQ support in Rust](#experimental-vpclmulqdq-support-in-rust) is enabled. ## Other CRC widths -There are [a lot of other known CRC widths and variants](https://reveng.sourceforge.io/crc-catalogue/all.htm), ranging -from `CRC-3/GSM` to `CRC-82/DARC`, and everything in between. +There are [a lot of other known CRC widths and variants](https://reveng.sourceforge.io/crc-catalogue/all.htm), ranging +from `CRC-3/GSM` to `CRC-82/DARC`, and everything in between. -Since [Awesome](https://awesome.co) doesn't use any that aren't `CRC-32` or `CRC-64` in length, this library doesn't -currently support them, either. (It should support any newly created or discovered `CRC-32` and `CRC-64` variants, +Since [Awesome](https://awesome.co) doesn't use any that aren't `CRC-32` or `CRC-64` in length, this library doesn't +currently support them, either. (It should support any newly created or discovered `CRC-32` and `CRC-64` variants, though, with zero changes other than defining the [Rocksoft](http://www.ross.net/crc/download/crc_v3.txt) parameters). In theory, much of the "heavy lifting" has been done, so it should be possible to add other widths with minimal effort. PRs welcome! -## Implementations -* [crc-fast-php-ext](https://github.com/awesomized/crc-fast-php-ext) `PHP` extension using this library. - ## References * [crc32-fast](https://crates.io/crates/crc32fast) Original `CRC-32/ISO-HDLC` (`crc32`) implementation in `Rust`. * [crc64-fast](https://github.com/tikv/crc64fast) Original `CRC-64/XZ` implementation in `Rust`. * [crc64fast-nvme](https://github.com/awesomized/crc64fast-nvme) Original `CRC-64/NVME` implementation in `Rust`. 
-* [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) +* [Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction](https://web.archive.org/web/20131224125630/https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf) Intel's paper. -* [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) +* [NVM Express® NVM Command Set Specification](https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf) The NVMe spec, including `CRC-64-NVME` (with incorrect endian `Check` value in `Section 5.2.1.3.4, Figure 120, page 83`). * [CRC-64/NVME](https://reveng.sourceforge.io/crc-catalogue/all.htm#crc.cat.crc-64-nvme) The `CRC-64/NVME` quick definition. -* [A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS](http://www.ross.net/crc/download/crc_v3.txt) Best description of +* [A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS](http://www.ross.net/crc/download/crc_v3.txt) Best description of CRC I've seen to date (and the definition of the Rocksoft model). * [Linux implementation](https://github.com/torvalds/linux/blob/786c8248dbd33a5a7a07f7c6e55a7bfc68d2ca48/lib/crc64.c) Linux implementation of `CRC-64/NVME`. -* [MASM/C++ artifacts implementation](https://github.com/jeffareid/crc/) - Reference MASM/C++ implementation for +* [MASM/C++ artifacts implementation](https://github.com/jeffareid/crc/) - Reference MASM/C++ implementation for generating artifacts. * [Intel isa-l GH issue #88](https://github.com/intel/isa-l/issues/88) - Additional insight into generating artifacts. -* [StackOverflow PCLMULQDQ CRC32 answer](https://stackoverflow.com/questions/71328336/fast-crc-with-pclmulqdq-not-reflected/71329114#71329114) +* [StackOverflow PCLMULQDQ CRC32 answer](https://stackoverflow.com/questions/71328336/fast-crc-with-pclmulqdq-not-reflected/71329114#71329114) Insightful answer to implementation details for CRC32. -* [StackOverflow PCLMULQDQ CRC32 question](https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq) +* [StackOverflow PCLMULQDQ CRC32 question](https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq) Insightful question & answer to CRC32 implementation details. 
* [AWS S3 announcement about CRC64-NVME support](https://aws.amazon.com/blogs/aws/introducing-default-data-integrity-protections-for-new-objects-in-amazon-s3/) * [AWS S3 docs on checking object integrity using CRC64-NVME](https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html) * [Vector Carry-Less Multiplication of Quadwords (VPCLMULQDQ) details](https://en.wikichip.org/wiki/x86/vpclmulqdq) * [Linux kernel updates by Eric Biggers to use VPCLMULQDQ, etc](https://lkml.org/lkml/2025/2/10/1367) +* [Faster CRC32-C on x86](https://www.corsix.org/content/fast-crc32c-4k) +* [Faster CRC32 on the Apple M1](https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/) +* [An alternative exposition of crc32_4k_pclmulqdq](https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq) +* [fast-crc32](https://github.com/corsix/fast-crc32) - implementations of fusion for two CRC-32 variants. ## License diff --git a/benches/benchmark.rs b/benches/benchmark.rs index aacbac8..fffb0a4 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -4,6 +4,7 @@ use crc_fast::checksum; use crc_fast::CrcAlgorithm; use criterion::*; use rand::{rng, RngCore}; +use std::time::Duration; pub const SIZES: &[(&str, i32); 2] = &[ ("1 MiB", 1024 * 1024), @@ -29,11 +30,12 @@ pub const SIZES: &[(&str, i32); 2] = &[ ]; // these are the most important algorithms in popular use, with forward/reflected coverage -pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 4] = &[ - CrcAlgorithm::Crc32Autosar, // reflected, internal - CrcAlgorithm::Crc32Iscsi, // reflected, custom - CrcAlgorithm::Crc32IsoHdlc, // reflected, custom - CrcAlgorithm::Crc32Bzip2, // forward, internal +pub const CRC32_ALGORITHMS: &[CrcAlgorithm; 3] = &[ + // benchmark both CRC-32/ISCSI and CRC-32/ISO-HDLC since they're special flowers with lots of + // different acceleration targets. 
+ CrcAlgorithm::Crc32Iscsi, // reflected + CrcAlgorithm::Crc32IsoHdlc, // reflected + CrcAlgorithm::Crc32Bzip2, // forward ]; // these are the most important algorithms in popular use, with forward/reflected coverage @@ -78,13 +80,9 @@ fn bench_crc32(c: &mut Criterion) { let mut group = c.benchmark_group("CRC-32"); println!( - "CRC-32/ISCSI implementation {}", + "Acceleration target: {}", crc_fast::get_calculator_target(CrcAlgorithm::Crc32Iscsi) ); - println!( - "CRC-32/ISO-HDLC implementation {}", - crc_fast::get_calculator_target(CrcAlgorithm::Crc32IsoHdlc) - ); for (size_name, size) in SIZES { let buf = create_aligned_data(&*random_data(*size)); @@ -101,6 +99,7 @@ fn bench_crc32(c: &mut Criterion) { group.throughput(Throughput::Bytes(*size as u64)); group.sample_size(1000); + group.measurement_time(Duration::from_secs(30)); let bench_name = [alg_suffix.unwrap(), "(checksum)"].join(" "); @@ -128,6 +127,11 @@ fn bench_crc32(c: &mut Criterion) { #[inline(always)] fn bench_crc64(c: &mut Criterion) { + println!( + "Acceleration target: {}", + crc_fast::get_calculator_target(CrcAlgorithm::Crc64Nvme) + ); + let mut group = c.benchmark_group("CRC-64"); for (size_name, size) in SIZES { @@ -145,6 +149,7 @@ fn bench_crc64(c: &mut Criterion) { group.throughput(Throughput::Bytes(*size as u64)); group.sample_size(1000); + group.measurement_time(Duration::from_secs(30)); let bench_name = [alg_suffix.unwrap(), "(checksum)"].join(" "); @@ -170,6 +175,6 @@ fn bench_crc64(c: &mut Criterion) { } } -criterion_group!(benches, bench_crc64, bench_crc32); +criterion_group!(benches, bench_crc32, bench_crc64); criterion_main!(benches); diff --git a/build.rs b/build.rs deleted file mode 100644 index e7b3893..0000000 --- a/build.rs +++ /dev/null @@ -1,230 +0,0 @@ -#![allow(dead_code)] -#![allow(unused)] - -extern crate cc; - -use cc::Build; -use std::env; - -#[cfg(target_arch = "aarch64")] -use std::arch::is_aarch64_feature_detected; - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use std::arch::is_x86_feature_detected; - -fn main() { - // Windows doesn't build the C bindings automatically, and since they're auto-generated from - // another project, I'm not inclined to fix it. The Rust implementation is still very fast. 
- #[cfg(target_os = "windows")] - return; - - // build hardware optimized version - build_optimized(); -} - -/// Builds hardware-optimized versions of the CRC32 functions -fn build_optimized() { - // in build scripts, the target architecture is only available via an environment variable - let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); - - if "aarch64" == target_arch { - return build_optimized_aarch64(); - } - - if "x86_64" == target_arch || "x86" == target_arch { - build_optimized_x86() - } - - // fall back to Rust implementation -} - -fn build_optimized_target_crc32_iscsi(name: &str, flags: &[String]) { - build_optimized_target(name, flags); - - println!("cargo:rustc-cfg=optimized_crc32_iscsi"); -} - -fn build_optimized_target_crc32_iso_hdlc(name: &str, flags: &[String]) { - build_optimized_target(name, flags); - - println!("cargo:rustc-cfg=optimized_crc32_iso_hdlc"); -} - -fn build_optimized_target(name: &str, flags: &[String]) { - // Create a longer-lived binding as suggested by the error message - let mut binding = Build::new(); - let mut build = binding.file(format!("include/{name}.c")).include("include"); - - // Apply each flag individually - for flag in flags { - build = build.flag(flag); - } - - build.compile(name); -} - -fn build_optimized_aarch64() { - // feature flag overrides to allow forcing a specific implementation - - // NEON EOR3, which seems to be faster for larger payloads, - // but slower for smaller ones than v12e_v1 - #[cfg(feature = "optimize_crc32_neon_eor3_v9s3x2e_s3")] - return build_neon_eor3_v9s3x2e_s3(); - - // NEON w/o EOR3, tuned for Apple M1, which is MUCH faster at smaller payloads, and slightly - // slower at larger ones, on my Apple M2 Ultra - #[cfg(feature = "optimize_crc32_neon_v12e_v1")] - return build_neon_v12e_v1(); - - // NEON w/o EOR3, tuned for Ampere Altra Arm (GCP Tau T2A) - #[cfg(feature = "optimize_crc32_neon_v3s4x2e_v2")] - return build_neon_v3s4x2e_v2(); - - // NEON w/EOR3 for large payloads (>1KiB), NEON w/o EOR3 for small ones - #[cfg(feature = "optimize_crc32_neon_blended")] - return build_neon_blended(); - - // no auto-optimize enabled, return and use the internal Rust implementation - #[cfg(feature = "optimize_crc32_auto")] - { - // for auto, default to NEON blended with EOR3 for large (>1KiB) payloads, w/o EOR3 for - // small ones - #[allow(unreachable_code)] - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_aarch64_feature_detected!("crc") && is_aarch64_feature_detected!("sha3") { - return build_neon_blended(); - } - - // for auto, fallback to non-EOR3 if SHA3 is not available - #[allow(unreachable_code)] - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_aarch64_feature_detected!("crc") { - build_neon_v12e_v1() - } - } - - // fall through to internal Rust implementation -} - -fn build_neon_blended() { - println!("Building NEON blended"); - - let flags = [String::from("-march=armv8.2-a+crypto+crc+sha3")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_blended", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_blended", &flags); -} - -fn build_neon_eor3_v9s3x2e_s3() { - println!("Building NEON EOR3 v9s3x2e s3"); - - let flags = [String::from("-march=armv8.2-a+crypto+crc+sha3")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_eor3_v9s3x2e_s3", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_eor3_v9s3x2e_s3", &flags); -} - -fn build_neon_v12e_v1() { - println!("Building NEON v12e v1"); - - let flags = 
[String::from("-march=armv8-a+crypto+crc")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_v12e_v1", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_v12e_v1", &flags); -} - -fn build_neon_v3s4x2e_v2() { - println!("Building NEON v12e v1"); - - let flags = [String::from("-march=armv8-a+crypto+crc")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_neon_v3s4x2e_v2", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_neon_v3s4x2e_v2", &flags); -} - -fn build_optimized_x86() { - // feature flag overrides to allow forcing a specific implementation - - #[cfg(feature = "optimize_crc32_avx512_vpclmulqdq_v3x2")] - return build_avx512_vpclmulqdq_v3x2(); - - #[cfg(feature = "optimize_crc32_avx512_v4s3x3")] - return build_avx512_v4s3x3(); - - #[cfg(feature = "optimize_crc32_sse_v4s3x3")] - return build_sse_v4s3x3(); - - // no auto-optimize enabled, return and use the internal Rust implementation - #[cfg(feature = "optimize_crc32_auto")] - { - // for auto, default to the best available implementation based on CPU features - - // in build scripts, the target architecture is only available via an environment variable - let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); - if "x86" == target_arch { - // this is the only one supported on 32-bit x86 systems - crate::build_sse_v4s3x3() - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if is_x86_feature_detected!("vpclmulqdq") - && is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - { - return build_avx512_vpclmulqdq_v3x2(); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if is_x86_feature_detected!("avx512vl") - && is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("pclmulqdq") - { - return crate::build_avx512_v4s3x3(); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if is_x86_feature_detected!("sse4.2") && is_x86_feature_detected!("pclmulqdq") { - crate::build_sse_v4s3x3() - } - } - - // fall through to internal Rust implementation -} - -fn build_avx512_vpclmulqdq_v3x2() { - println!("Building AVX512 VPCLMULQDQ v3x2"); - - let flags = [ - String::from("-msse4.2"), - String::from("-mpclmul"), - String::from("-mavx512f"), - String::from("-mavx512vl"), - String::from("-mvpclmulqdq"), - ]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_avx512_vpclmulqdq_v3x2", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_avx512_vpclmulqdq_v3x2", &flags); -} - -fn build_avx512_v4s3x3() { - println!("Building AVX512 v4s3x3"); - - let flags = [ - String::from("-msse4.2"), - String::from("-mpclmul"), - String::from("-mavx512f"), - String::from("-mavx512vl"), - ]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_avx512_v4s3x3", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_avx512_v4s3x3", &flags); -} - -fn build_sse_v4s3x3() { - println!("Building SSE v4s3x3 for x86 / x86_64"); - - let flags = [String::from("-msse4.2"), String::from("-mpclmul")]; - - build_optimized_target_crc32_iscsi("crc32_iscsi_sse_v4s3x3", &flags); - build_optimized_target_crc32_iso_hdlc("crc32_iso_hdlc_sse_v4s3x3", &flags); -} diff --git a/include/crc32_iscsi.h b/include/crc32_iscsi.h deleted file mode 100644 index 142dc22..0000000 --- a/include/crc32_iscsi.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Generated header for hardware-accelerated CRC-32/ISCSI implementation */ -/* Original implementation from https://github.com/corsix/fast-crc32/ */ -/* MIT licensed */ - -#ifndef CRC32_ISCSI_H -#define CRC32_ISCSI_H - 
-#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The target build properties (CPU architecture and fine-tuning parameters) for the compiled implementation. - */ -extern const char *const ISCSI_TARGET; - -/** - * Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation. - */ -const char *get_iscsi_target(void); - -/** - * Calculate CRC-32/ISCSI checksum using hardware acceleration - * - * @param crc0 Initial CRC value (typically 0) - * @param buf Pointer to input data buffer - * @param len Length of input data in bytes - * - * @return Calculated CRC-32/ISCSI checksum - */ -uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len); - -#ifdef __cplusplus -} -#endif - -#endif /* CRC32_ISCSI_H */ \ No newline at end of file diff --git a/include/crc32_iscsi_avx512_v4s3x3.c b/include/crc32_iscsi_avx512_v4s3x3.c deleted file mode 100644 index ce0b39f..0000000 --- a/include/crc32_iscsi_avx512_v4s3x3.c +++ /dev/null @@ -1,149 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512 -p crc32c -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -const char *const ISCSI_TARGET = "x86_64_avx512_v4s3x3"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = _mm_crc32_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = _mm_crc32_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. 
*/ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128((const __m128i*)buf2), 0x96); - x1 = _mm_ternarylogic_epi64(x1, y1, _mm_loadu_si128((const __m128i*)(buf2 + 16)), 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, _mm_loadu_si128((const __m128i*)(buf2 + 32)), 0x96); - x3 = _mm_ternarylogic_epi64(x3, y3, _mm_loadu_si128((const __m128i*)(buf2 + 48)), 0x96); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); - k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); - /* Final scalar chunk. */ - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = _mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = _mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - vc ^= _mm_extract_epi64(crc_shift(_mm_crc32_u64(_mm_crc32_u64(0, _mm_extract_epi64(x0, 0)), _mm_extract_epi64(x0, 1)), klen * 3 + 8), 0); - /* Final 8 bytes. 
*/ - buf += klen * 2; - crc0 = crc2; - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_avx512_vpclmulqdq_v3x2.c b/include/crc32_iscsi_avx512_vpclmulqdq_v3x2.c deleted file mode 100644 index 9ff1d8a..0000000 --- a/include/crc32_iscsi_avx512_vpclmulqdq_v3x2.c +++ /dev/null @@ -1,97 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) -#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) - -const char *const ISCSI_TARGET = "x86_64_avx512_vpclmulqdq_v3x2"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - while (((uintptr_t)buf & 56) && len >= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 384) { - __m128i z0; - /* First vector chunk. */ - __m512i x0 = _mm512_loadu_si512((const void*)buf), y0; - __m512i x1 = _mm512_loadu_si512((const void*)(buf + 64)), y1; - __m512i x2 = _mm512_loadu_si512((const void*)(buf + 128)), y2; - __m512i k; - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0xa87ab8a8, 0, 0xab7aff2a, 0)); - x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - /* Main loop. */ - while (len >= 384) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)buf), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 64)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 128)), 0x96); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - } - /* Reduce x0 ... x2 to just x0. 
*/ - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0)); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - x1 = x2; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - /* Reduce 512 bits to 128 bits. */ - k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb, 0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0); - y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); - y0 = _mm512_xor_si512(y0, k); - z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96); - z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0)); - crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_neon_blended.c b/include/crc32_iscsi_neon_blended.c deleted file mode 100644 index 6ae7572..0000000 --- a/include/crc32_iscsi_neon_blended.c +++ /dev/null @@ -1,319 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ */ -/* Modified post-generation to improve function names, include build targets, - and bifurcate large (>1KiB) and small payloads for optimized performance */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_blended"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32cw(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32cd(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_AINLINE uint32_t crc32_iscsi_large_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - size_t blk = (len - 0) / 192; - size_t klen = 
blk * 16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x7e908048, 0xc96cfdc0}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y1 = clmul_lo_eor3(x1, k), x1 = clmul_hi_eor3(x1, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y3 = clmul_lo_eor3(x3, k), x3 = clmul_hi_eor3(x3, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y5 = clmul_lo_eor3(x5, k), x5 = clmul_hi_eor3(x5, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - y7 = clmul_lo_eor3(x7, k), x7 = clmul_hi_eor3(x7, k); - y8 = clmul_lo_eor3(x8, k), x8 = clmul_hi_eor3(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. 
*/ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. */ - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32cd(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} - - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint32_t crc32_iscsi_small_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xa87ab8a8, 0xab7aff2a}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - // Define 1 KiB threshold (1024 bytes) - const size_t LARGE_BUFFER_THRESHOLD = 1024; - - // Select implementation based on buffer size - if (len <= LARGE_BUFFER_THRESHOLD) { - return crc32_iscsi_small_impl(crc0, buf, len); - } else { - return crc32_iscsi_large_impl(crc0, buf, len); - } -} diff --git a/include/crc32_iscsi_neon_eor3_v9s3x2e_s3.c b/include/crc32_iscsi_neon_eor3_v9s3x2e_s3.c deleted file mode 100644 index 2f672e9..0000000 --- a/include/crc32_iscsi_neon_eor3_v9s3x2e_s3.c +++ /dev/null @@ -1,200 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_eor3_v9s3x2e_s3"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32cw(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32cd(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - 
size_t blk = (len - 0) / 192; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x7e908048, 0xc96cfdc0}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k); - y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. 
*/ - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32cd(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_neon_v12e_v1.c b/include/crc32_iscsi_neon_v12e_v1.c deleted file mode 100644 index 87438b0..0000000 --- a/include/crc32_iscsi_neon_v12e_v1.c +++ /dev/null @@ -1,130 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32c -a v12e_v1 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_v12e_v1"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xa87ab8a8, 0xab7aff2a}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x740eef02, 0x9e4addf8}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} \ No newline at end of file diff --git a/include/crc32_iscsi_neon_v3s4x2e_v2.c b/include/crc32_iscsi_neon_v3s4x2e_v2.c deleted file mode 100644 index a855e5f..0000000 --- a/include/crc32_iscsi_neon_v3s4x2e_v2.c +++ /dev/null @@ -1,169 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32c -a v3s4x2e_v2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISCSI_TARGET = "aarch64_neon_v3s4x2e_v2"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32cw(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32cd(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32cb(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 112) { - const char* end = buf + len; - size_t blk = (len - 0) / 112; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 4; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint32_t crc3 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64x2_t vc3; - 
uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x1c291d04, 0xddc0152b}; k = vld1q_u64(k_); } - buf2 += 48; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf2)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf2 + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf2 + 32))), x2 = clmul_hi_e(x2, k, y2); - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - buf += 16; - buf2 += 48; - } - /* Reduce x0 ... x2 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - x1 = x2; - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Final scalar chunk. */ - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32cd(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32cd(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32cd(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32cd(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - vc0 = crc_shift(crc0, klen * 3 + blk * 48); - vc1 = crc_shift(crc1, klen * 2 + blk * 48); - vc2 = crc_shift(crc2, klen + blk * 48); - vc3 = crc_shift(crc3, 0 + blk * 48); - vc = vgetq_lane_u64(veorq_u64(veorq_u64(vc0, vc1), veorq_u64(vc2, vc3)), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 32; - len -= 32; - /* Main loop. */ - while (len >= 32) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - buf += 32; - len -= 32; - } - /* Reduce x0 ... x1 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf20c0dfe, 0x493c7d27}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ - crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32cd(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32cb(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iscsi_sse_v4s3x3.c b/include/crc32_iscsi_sse_v4s3x3.c deleted file mode 100644 index 63725ea..0000000 --- a/include/crc32_iscsi_sse_v4s3x3.c +++ /dev/null @@ -1,223 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i sse -p crc32c -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ -/* Modified for 32-bit compatibility */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -const char *const ISCSI_TARGET = "x86_sse_v4s3x3"; - -const char *get_iscsi_target() { - return ISCSI_TARGET; -} - -/* Platform-specific 64-bit handling */ -#if defined(__x86_64__) || defined(_M_X64) -/* 64-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - return _mm_cvtsi64_si128(val); -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - return _mm_cvtsi128_si64(val); -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Even on 64-bit platforms, we need to use constant indices */ - if (idx == 0) { - return _mm_cvtsi128_si64(val); - } else { - /* For the high 64 bits */ - return _mm_cvtsi128_si64(_mm_srli_si128(val, 8)); - } -} - -CRC_AINLINE uint32_t mm_crc32_u64(uint32_t crc, uint64_t val) { - return _mm_crc32_u64(crc, val); -} -#else -/* 32-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - /* Split 64-bit value into two 32-bit parts for 32-bit platform */ - __m128i result, temp; - result = _mm_cvtsi32_si128((uint32_t)val); /* Low 32 bits */ - temp = _mm_cvtsi32_si128((uint32_t)(val >> 32)); /* High 32 bits */ - - /* Shift high 32 bits to position 1 */ - temp = _mm_slli_si128(temp, 4); - - /* Combine low and high parts */ - result = _mm_or_si128(result, temp); - return result; -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - /* Combine two 32-bit values into one 64-bit result */ - uint32_t low = _mm_cvtsi128_si32(val); - uint32_t high = _mm_extract_epi32(val, 1); - return ((uint64_t)high << 32) | low; -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Extract 64 bits (two 32-bit values) */ - uint32_t low, high; - - if (idx == 0) { - low = _mm_cvtsi128_si32(val); - high = _mm_extract_epi32(val, 1); - } else { - low = _mm_extract_epi32(val, 2); - high = _mm_extract_epi32(val, 3); - } - - return ((uint64_t)high << 32) | low; -} - -CRC_AINLINE uint32_t mm_crc32_u64(uint32_t crc, uint64_t val) { - /* Process 64-bit value in two 32-bit chunks on 32-bit platforms */ - crc = _mm_crc32_u32(crc, (uint32_t)val); - crc = _mm_crc32_u32(crc, (uint32_t)(val >> 32)); - return crc; -} -#endif - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t 
acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = _mm_crc32_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = mm_crc32_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iscsi_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. */ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf2)), x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf2 + 16))), x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf2 + 32))), x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf2 + 48))), x3 = _mm_xor_si128(x3, y3); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); - y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); - k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); - /* Final scalar chunk. 
*/ - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = mm_crc32_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = mm_crc32_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - /* Extract the two 64-bit parts of x0 and combine them */ - uint64_t x0_low = mm_extract_epi64(x0, 0); - uint64_t x0_high = mm_extract_epi64(x0, 1); - uint64_t x0_combined = mm_extract_epi64(crc_shift(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), 0); - vc ^= x0_combined; - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = mm_crc32_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - return ~crc0; -} \ No newline at end of file diff --git a/include/crc32_iso_hdlc.h b/include/crc32_iso_hdlc.h deleted file mode 100644 index d5b990c..0000000 --- a/include/crc32_iso_hdlc.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Generated header for hardware-accelerated CRC-32/ISO_HDLC implementation */ -/* Original implementation from https://github.com/corsix/fast-crc32/ */ -/* MIT licensed */ - -#ifndef CRC32_ISO_HDLC_H -#define CRC32_ISO_HDLC_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The target build properties (CPU architecture and fine-tuning parameters) for the compiled implementation. - */ -extern const char *const ISO_HDLC_TARGET; - -/** - * Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation. 
- */ -const char *get_iso_hdlc_target(void); - -/** - * Calculate CRC-32/ISO_HDLC checksum using hardware acceleration - * - * @param crc0 Initial CRC value (typically 0) - * @param buf Pointer to input data buffer - * @param len Length of input data in bytes - * - * @return Calculated CRC-32/ISO_HDLC checksum - */ -uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len); - -#ifdef __cplusplus -} -#endif - -#endif /* CRC32_ISO_HDLC_H */ \ No newline at end of file diff --git a/include/crc32_iso_hdlc_avx512_v4s3x3.c b/include/crc32_iso_hdlc_avx512_v4s3x3.c deleted file mode 100644 index 7e8624e..0000000 --- a/include/crc32_iso_hdlc_avx512_v4s3x3.c +++ /dev/null @@ -1,215 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512 -p crc32 -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -static const uint32_t g_crc_table[1][256] = {{ - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 
0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}}; - -const char *const ISO_HDLC_TARGET = "x86_64_avx512_v4s3x3"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint32_t crc_u8(uint32_t crc, uint8_t val) { - return (crc >> 8) ^ g_crc_table[0][(crc & 0xFF) ^ val]; -} - -CRC_AINLINE uint32_t crc_u64(uint32_t crc, uint64_t val) { - __m128i k = _mm_setr_epi32(0xf7011641, 0xb4e5b025, 0xdb710641, 1); - __m128i a = _mm_cvtsi64_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -CRC_AINLINE uint32_t crc_u32(uint32_t crc, uint32_t val) { - __m128i k = _mm_setr_epi32(0x00000000, 0xf7011641, 0xdb710641, 1); - __m128i a = _mm_cvtsi32_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = crc_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = crc_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = crc_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. 
*/ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128((const __m128i*)buf2), 0x96); - x1 = _mm_ternarylogic_epi64(x1, y1, _mm_loadu_si128((const __m128i*)(buf2 + 16)), 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, _mm_loadu_si128((const __m128i*)(buf2 + 32)), 0x96); - x3 = _mm_ternarylogic_epi64(x3, y3, _mm_loadu_si128((const __m128i*)(buf2 + 48)), 0x96); - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); - x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); - k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); - /* Final scalar chunk. */ - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - vc ^= _mm_extract_epi64(crc_shift(crc_u64(crc_u64(0, _mm_extract_epi64(x0, 0)), _mm_extract_epi64(x0, 1)), klen * 3 + 8), 0); - /* Final 8 bytes. 
*/ - buf += klen * 2; - crc0 = crc2; - crc0 = crc_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = crc_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_avx512_vpclmulqdq_v3x2.c b/include/crc32_iso_hdlc_avx512_vpclmulqdq_v3x2.c deleted file mode 100644 index e3836bb..0000000 --- a/include/crc32_iso_hdlc_avx512_vpclmulqdq_v3x2.c +++ /dev/null @@ -1,156 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i avx512_vpclmulqdq -p crc32 -a v3x2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -static const uint32_t g_crc_table[1][256] = {{ - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 
0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}}; - -const char *const ISO_HDLC_TARGET = "x86_64_avx512_vpclmulqdq_v3x2"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint32_t crc_u8(uint32_t crc, uint8_t val) { - return (crc >> 8) ^ g_crc_table[0][(crc & 0xFF) ^ val]; -} - -CRC_AINLINE uint32_t crc_u64(uint32_t crc, uint64_t val) { - __m128i k = _mm_setr_epi32(0xf7011641, 0xb4e5b025, 0xdb710641, 1); - __m128i a = _mm_cvtsi64_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) -#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = crc_u8(crc0, *buf++); - } - while (((uintptr_t)buf & 56) && len >= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 384) { - __m128i z0; - /* First vector chunk. */ - __m512i x0 = _mm512_loadu_si512((const void*)buf), y0; - __m512i x1 = _mm512_loadu_si512((const void*)(buf + 64)), y1; - __m512i x2 = _mm512_loadu_si512((const void*)(buf + 128)), y2; - __m512i k; - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x596c8d81, 0, 0xf5e48c85, 0)); - x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - /* Main loop. */ - while (len >= 384) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)buf), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 64)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 128)), 0x96); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void*)(buf + 192)), 0x96); - x1 = _mm512_ternarylogic_epi64(x1, y1, _mm512_loadu_si512((const void*)(buf + 256)), 0x96); - x2 = _mm512_ternarylogic_epi64(x2, y2, _mm512_loadu_si512((const void*)(buf + 320)), 0x96); - buf += 384; - len -= 384; - } - /* Reduce x0 ... x2 to just x0. 
*/ - k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0)); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - x1 = x2; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); - /* Reduce 512 bits to 128 bits. */ - k = _mm512_setr_epi32(0x3db1ecdc, 0, 0xaf449247, 0, 0xf1da05aa, 0, 0x81256527, 0, 0xae689191, 0, 0xccaa009e, 0, 0, 0, 0, 0); - y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); - y0 = _mm512_xor_si512(y0, k); - z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96); - z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = crc_u64(0, _mm_extract_epi64(z0, 0)); - crc0 = crc_u64(crc0, _mm_extract_epi64(z0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = crc_u8(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_neon_blended.c b/include/crc32_iso_hdlc_neon_blended.c deleted file mode 100644 index 9c1ff8e..0000000 --- a/include/crc32_iso_hdlc_neon_blended.c +++ /dev/null @@ -1,319 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ */ -/* Modified post-generation to improve function names, include build targets, - and bifurcate large (>1KiB) and small payloads for optimized performance */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_blended"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_eor3(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32w(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32d(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_AINLINE uint32_t crc32_iso_hdlc_large_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - size_t blk = (len - 0) / 192; - size_t klen = blk * 
16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y1 = clmul_lo_eor3(x1, k), x1 = clmul_hi_eor3(x1, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y3 = clmul_lo_eor3(x3, k), x3 = clmul_hi_eor3(x3, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y5 = clmul_lo_eor3(x5, k), x5 = clmul_hi_eor3(x5, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - y7 = clmul_lo_eor3(x7, k), x7 = clmul_hi_eor3(x7, k); - y8 = clmul_lo_eor3(x8, k), x8 = clmul_hi_eor3(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y2 = clmul_lo_eor3(x2, k), x2 = clmul_hi_eor3(x2, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - y6 = clmul_lo_eor3(x6, k), x6 = clmul_hi_eor3(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - y4 = clmul_lo_eor3(x4, k), x4 = clmul_hi_eor3(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo_eor3(x0, k), x0 = clmul_hi_eor3(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. 
*/ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint32_t crc32_iso_hdlc_small_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x596c8d81, 0xf5e48c85}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. 
*/ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - // Define 1 KiB threshold (1024 bytes) - const size_t LARGE_BUFFER_THRESHOLD = 1024; - - // Select implementation based on buffer size - if (len <= LARGE_BUFFER_THRESHOLD) { - return crc32_iso_hdlc_small_impl(crc0, buf, len); - } else { - return crc32_iso_hdlc_large_impl(crc0, buf, len); - } -} - diff --git a/include/crc32_iso_hdlc_neon_eor3_v9s3x2e_s3.c b/include/crc32_iso_hdlc_neon_eor3_v9s3x2e_s3.c deleted file mode 100644 index 8197fb3..0000000 --- a/include/crc32_iso_hdlc_neon_eor3_v9s3x2e_s3.c +++ /dev/null @@ -1,200 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_eor3_v9s3x2e_s3"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) { - uint64x2_t r; - __asm("pmull2 %0.1q, %1.2d, %2.2d\n" : "=w"(r) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32w(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32d(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - size_t blk = (len - 0) / 192; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 3; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64_t vc; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); } - buf2 += 144; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k); - y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k); - x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2)); - x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16))); - x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32))); - x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48))); - x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64))); - x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80))); - x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96))); - x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112))); - x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128))); - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - buf += 16; - buf2 += 144; - } - /* Reduce x0 ... x8 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x1); - x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8; - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k); - x0 = veor3q_u64(x0, y0, x1); - x2 = veor3q_u64(x2, y2, x3); - x4 = veor3q_u64(x4, y4, x5); - x6 = veor3q_u64(x6, y6, x7); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k); - x0 = veor3q_u64(x0, y0, x2); - x4 = veor3q_u64(x4, y4, x6); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - x0 = veor3q_u64(x0, y0, x4); - /* Final scalar chunk. 
*/ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - vc0 = crc_shift(crc0, klen * 2 + blk * 144); - vc1 = crc_shift(crc1, klen + blk * 144); - vc2 = crc_shift(crc2, 0 + blk * 144); - vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - size_t klen = ((len - 8) / 24) * 8; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64_t vc; - /* Main loop. */ - do { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - buf += 8; - len -= 24; - } while (len >= 32); - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_neon_v12e_v1.c b/include/crc32_iso_hdlc_neon_v12e_v1.c deleted file mode 100644 index 8ddafa6..0000000 --- a/include/crc32_iso_hdlc_neon_v12e_v1.c +++ /dev/null @@ -1,130 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32 -a v12e_v1 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_v12e_v1"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 192) { - const char* end = buf + len; - const char* limit = buf + len - 192; - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf + 32)), y2; - uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf + 48)), y3; - uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf + 64)), y4; - uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf + 80)), y5; - uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf + 96)), y6; - uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf + 112)), y7; - uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf + 128)), y8; - uint64x2_t x9 = vld1q_u64((const uint64_t*)(buf + 144)), y9; - uint64x2_t x10 = vld1q_u64((const uint64_t*)(buf + 160)), y10; - uint64x2_t x11 = vld1q_u64((const uint64_t*)(buf + 176)), y11; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x596c8d81, 0xf5e48c85}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 192; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf + 32))), x2 = clmul_hi_e(x2, k, y2); - y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t*)(buf + 48))), x3 = clmul_hi_e(x3, k, y3); - y4 = clmul_lo_e(x4, k, vld1q_u64((const uint64_t*)(buf + 64))), x4 = clmul_hi_e(x4, k, y4); - y5 = clmul_lo_e(x5, k, vld1q_u64((const uint64_t*)(buf + 80))), x5 = clmul_hi_e(x5, k, y5); - y6 = clmul_lo_e(x6, k, vld1q_u64((const uint64_t*)(buf + 96))), x6 = clmul_hi_e(x6, k, y6); - y7 = clmul_lo_e(x7, k, vld1q_u64((const uint64_t*)(buf + 112))), x7 = clmul_hi_e(x7, k, y7); - y8 = clmul_lo_e(x8, k, vld1q_u64((const uint64_t*)(buf + 128))), x8 = clmul_hi_e(x8, k, y8); - y9 = clmul_lo_e(x9, k, vld1q_u64((const uint64_t*)(buf + 144))), x9 = clmul_hi_e(x9, k, y9); - y10 = clmul_lo_e(x10, k, vld1q_u64((const uint64_t*)(buf + 160))), x10 = clmul_hi_e(x10, k, y10); - y11 = clmul_lo_e(x11, k, vld1q_u64((const uint64_t*)(buf + 176))), x11 = clmul_hi_e(x11, k, y11); - buf += 192; - } - /* Reduce x0 ... x11 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); - y4 = clmul_lo_e(x4, k, x5), x4 = clmul_hi_e(x4, k, y4); - y6 = clmul_lo_e(x6, k, x7), x6 = clmul_hi_e(x6, k, y6); - y8 = clmul_lo_e(x8, k, x9), x8 = clmul_hi_e(x8, k, y8); - y10 = clmul_lo_e(x10, k, x11), x10 = clmul_hi_e(x10, k, y10); - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); - y4 = clmul_lo_e(x4, k, x6), x4 = clmul_hi_e(x4, k, y4); - y8 = clmul_lo_e(x8, k, x10), x8 = clmul_hi_e(x8, k, y8); - { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - x4 = x8; - y0 = clmul_lo_e(x0, k, x4), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - len = end - buf; - } - if (len >= 16) { - /* First vector chunk. 
*/ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 16; - len -= 16; - /* Main loop. */ - while (len >= 16) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - buf += 16; - len -= 16; - } - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_neon_v3s4x2e_v2.c b/include/crc32_iso_hdlc_neon_v3s4x2e_v2.c deleted file mode 100644 index 5e23963..0000000 --- a/include/crc32_iso_hdlc_neon_v3s4x2e_v2.c +++ /dev/null @@ -1,169 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i neon -p crc32 -a v3s4x2e_v2 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -const char *const ISO_HDLC_TARGET = "aarch64_neon_v3s4x2e_v2"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint64x2_t clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) { - uint64x2_t r; - __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n" : "=w"(r), "+w"(c) : "w"(a), "w"(b)); - return r; -} - -CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) { - uint64x2_t r; - __asm("pmull %0.1q, %1.1d, %2.1d\n" : "=w"(r) : "w"(vmovq_n_u64(a)), "w"(vmovq_n_u64(b))); - return r; -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = __crc32w(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc)); - uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0); - acc = __crc32d(0, y << low); - } - return acc; -} - -CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = __crc32b(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 112) { - const char* end = buf + len; - size_t blk = (len - 0) / 112; - size_t klen = blk * 16; - const char* buf2 = buf + klen * 4; - const char* limit = buf + klen - 32; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - uint32_t crc3 = 0; - uint64x2_t vc0; - uint64x2_t vc1; - uint64x2_t vc2; - uint64x2_t vc3; - uint64_t vc; - /* 
First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1; - uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0x3db1ecdc, 0xaf449247}; k = vld1q_u64(k_); } - buf2 += 48; - /* Main loop. */ - while (buf <= limit) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf2)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf2 + 16))), x1 = clmul_hi_e(x1, k, y1); - y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t*)(buf2 + 32))), x2 = clmul_hi_e(x2, k, y2); - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - buf += 16; - buf2 += 48; - } - /* Reduce x0 ... x2 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - x1 = x2; - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Final scalar chunk. */ - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3)); - crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8)); - crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc3 = __crc32d(crc3, *(const uint64_t*)(buf + klen * 3 + 8)); - vc0 = crc_shift(crc0, klen * 3 + blk * 48); - vc1 = crc_shift(crc1, klen * 2 + blk * 48); - vc2 = crc_shift(crc2, klen + blk * 48); - vc3 = crc_shift(crc3, 0 + blk * 48); - vc = vgetq_lane_u64(veorq_u64(veorq_u64(vc0, vc1), veorq_u64(vc2, vc3)), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); - buf = buf2; - len = end - buf; - } - if (len >= 32) { - /* First vector chunk. */ - uint64x2_t x0 = vld1q_u64((const uint64_t*)buf), y0; - uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf + 16)), y1; - uint64x2_t k; - { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); } - x0 = veorq_u64((uint64x2_t){crc0, 0}, x0); - buf += 32; - len -= 32; - /* Main loop. */ - while (len >= 32) { - y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t*)buf)), x0 = clmul_hi_e(x0, k, y0); - y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t*)(buf + 16))), x1 = clmul_hi_e(x1, k, y1); - buf += 32; - len -= 32; - } - /* Reduce x0 ... x1 to just x0. */ - { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); } - y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ - crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); - crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = __crc32d(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = __crc32b(crc0, *buf++); - } - return ~crc0; -} diff --git a/include/crc32_iso_hdlc_sse_v4s3x3.c b/include/crc32_iso_hdlc_sse_v4s3x3.c deleted file mode 100644 index eaa4e60..0000000 --- a/include/crc32_iso_hdlc_sse_v4s3x3.c +++ /dev/null @@ -1,278 +0,0 @@ -/* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i sse -p crc32 -a v4s3x3 */ -/* Modified slightly post-generation to improve function name and include build target */ -/* MIT licensed */ -/* Modified for 32-bit compatibility */ - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define CRC_AINLINE static __forceinline -#define CRC_ALIGN(n) __declspec(align(n)) -#else -#define CRC_AINLINE static __inline __attribute__((always_inline)) -#define CRC_ALIGN(n) __attribute__((aligned(n))) -#endif -#define CRC_EXPORT extern - -static const uint32_t g_crc_table[1][256] = {{ - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}}; - -const char *const ISO_HDLC_TARGET = "x86_sse_v4s3x3"; - -const char *get_iso_hdlc_target() { - return ISO_HDLC_TARGET; -} - -CRC_AINLINE uint32_t crc_u8(uint32_t crc, uint8_t val) { - return (crc >> 8) ^ g_crc_table[0][(crc & 0xFF) ^ val]; -} - -/* Platform-specific 64-bit handling */ -#if defined(__x86_64__) || defined(_M_X64) -/* 64-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - return _mm_cvtsi64_si128(val); -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - return _mm_cvtsi128_si64(val); -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Even on 64-bit platforms, we need to use constant indices */ - if (idx == 0) { - return _mm_cvtsi128_si64(val); - } else { - /* For the high 64 bits */ - return _mm_cvtsi128_si64(_mm_srli_si128(val, 8)); - } -} -#else -/* 32-bit platform */ -CRC_AINLINE __m128i mm_cvtsi64_si128(uint64_t val) { - /* Split 64-bit value into two 32-bit parts for 32-bit platform */ - __m128i result, temp; - result = _mm_cvtsi32_si128((uint32_t)val); /* Low 32 bits */ - temp = _mm_cvtsi32_si128((uint32_t)(val >> 32)); /* High 32 bits */ - - /* Shift high 32 bits to position 1 */ - temp = _mm_slli_si128(temp, 4); - - /* Combine low and high parts */ - result = _mm_or_si128(result, temp); - return result; -} - -CRC_AINLINE uint64_t mm_cvtsi128_si64(__m128i val) { - /* Combine two 32-bit values into one 64-bit result */ - uint32_t low = _mm_cvtsi128_si32(val); - uint32_t high = _mm_extract_epi32(val, 1); - return ((uint64_t)high << 32) | low; -} - -CRC_AINLINE uint64_t mm_extract_epi64(__m128i val, int idx) { - /* Extract 64 bits (two 32-bit values) */ - uint32_t low, high; - - if (idx == 0) { - low = _mm_cvtsi128_si32(val); - high = _mm_extract_epi32(val, 1); - } else { - low = _mm_extract_epi32(val, 2); - high = _mm_extract_epi32(val, 3); - } - - return ((uint64_t)high << 32) | low; -} -#endif - -CRC_AINLINE uint32_t crc_u64(uint32_t crc, uint64_t val) { - __m128i k = _mm_setr_epi32(0xf7011641, 0xb4e5b025, 0xdb710641, 1); - __m128i a = mm_cvtsi64_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) - -CRC_AINLINE __m128i clmul_scalar(uint32_t a, uint32_t b) { - return _mm_clmulepi64_si128(_mm_cvtsi32_si128(a), _mm_cvtsi32_si128(b), 0); -} - -CRC_AINLINE uint32_t crc_u32(uint32_t crc, uint32_t val) { - __m128i k = _mm_setr_epi32(0x00000000, 0xf7011641, 0xdb710641, 1); - __m128i a = _mm_cvtsi32_si128(crc ^ val); - __m128i b = _mm_clmulepi64_si128(a, k, 0x00); - __m128i c = _mm_clmulepi64_si128(b, k, 0x10); - return _mm_extract_epi32(c, 2); -} - -static uint32_t xnmodp(uint64_t n) /* x^n mod P, in 
log(n) time */ { - uint64_t stack = ~(uint64_t)1; - uint32_t acc, low; - for (; n > 191; n = (n >> 1) - 16) { - stack = (stack << 1) + (n & 1); - } - stack = ~stack; - acc = ((uint32_t)0x80000000) >> (n & 31); - for (n >>= 5; n; --n) { - acc = crc_u32(acc, 0); - } - while ((low = stack & 1), stack >>= 1) { - __m128i x = _mm_cvtsi32_si128(acc); - uint64_t y = mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)); - acc = crc_u64(0, y << low); - } - return acc; -} - -CRC_AINLINE __m128i crc_shift(uint32_t crc, size_t nbytes) { - return clmul_scalar(crc, xnmodp(nbytes * 8 - 33)); -} - -CRC_EXPORT uint32_t crc32_iso_hdlc_impl(uint32_t crc0, const char* buf, size_t len) { - crc0 = ~crc0; - for (; len && ((uintptr_t)buf & 7); --len) { - crc0 = crc_u8(crc0, *buf++); - } - if (((uintptr_t)buf & 8) && len >= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - buf += 8; - len -= 8; - } - if (len >= 144) { - size_t blk = (len - 8) / 136; - size_t klen = blk * 24; - const char* buf2 = buf + 0; - uint32_t crc1 = 0; - uint32_t crc2 = 0; - __m128i vc0; - __m128i vc1; - uint64_t vc; - /* First vector chunk. */ - __m128i x0 = _mm_loadu_si128((const __m128i*)buf2), y0; - __m128i x1 = _mm_loadu_si128((const __m128i*)(buf2 + 16)), y1; - __m128i x2 = _mm_loadu_si128((const __m128i*)(buf2 + 32)), y2; - __m128i x3 = _mm_loadu_si128((const __m128i*)(buf2 + 48)), y3; - __m128i k; - k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); - crc0 = 0; - buf2 += 64; - len -= 136; - buf += blk * 64; - /* Main loop. */ - while (len >= 144) { - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf2)), x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf2 + 16))), x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf2 + 32))), x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf2 + 48))), x3 = _mm_xor_si128(x3, y3); - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - buf2 += 64; - len -= 136; - } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); - y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); - k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); - /* Final scalar chunk. 
*/ - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 8)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 8)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 8)); - crc0 = crc_u64(crc0, *(const uint64_t*)(buf + 16)); - crc1 = crc_u64(crc1, *(const uint64_t*)(buf + klen + 16)); - crc2 = crc_u64(crc2, *(const uint64_t*)(buf + klen * 2 + 16)); - buf += 24; - vc0 = crc_shift(crc0, klen * 2 + 8); - vc1 = crc_shift(crc1, klen + 8); - vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); - /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - /* Extract the two 64-bit parts of x0 and combine them */ - uint64_t x0_low = mm_extract_epi64(x0, 0); - uint64_t x0_high = mm_extract_epi64(x0, 1); - uint64_t x0_combined = mm_extract_epi64(crc_shift(crc_u64(crc_u64(0, x0_low), x0_high), klen * 3 + 8), 0); - vc ^= x0_combined; - /* Final 8 bytes. */ - buf += klen * 2; - crc0 = crc2; - crc0 = crc_u64(crc0, *(const uint64_t*)buf ^ vc), buf += 8; - len -= 8; - } - for (; len >= 8; buf += 8, len -= 8) { - crc0 = crc_u64(crc0, *(const uint64_t*)buf); - } - for (; len; --len) { - crc0 = crc_u8(crc0, *buf++); - } - return ~crc0; -} \ No newline at end of file diff --git a/src/algorithm.rs b/src/algorithm.rs index 4082377..20c394b 100644 --- a/src/algorithm.rs +++ b/src/algorithm.rs @@ -25,13 +25,9 @@ use crate::{crc32, crc64}; #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "sse3,sse4.1,pclmulqdq") )] -#[cfg_attr( - all(target_arch = "x86_64", feature = "vpclmulqdq"), - target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl") -)] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub unsafe fn update( state: W::Value, bytes: &[u8], @@ -82,9 +78,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_by_strategy( strategy: DataChunkProcessor, data: &[u8], @@ -118,13 +114,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") -)] -#[cfg_attr( - all(target_arch = "x86_64", feature = "vpclmulqdq"), - target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_large_aligned( bytes: &[u8], state: &mut CrcState, @@ -175,9 +167,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_simd_chunks( state: &mut CrcState, first: &[T::Vector; 8], @@ -255,9 +247,9 @@ unsafe fn process_simd_chunks( #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - 
target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_exactly_16( data: &[u8], state: &mut CrcState, @@ -281,9 +273,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_16_byte_block( data_ptr: *const u8, initial_crc: T::Vector, @@ -304,9 +296,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub(crate) unsafe fn reflect_bytes( reflector: &Reflector, data: T::Vector, @@ -325,9 +317,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn fold_and_xor( current: T::Vector, coefficient: T::Vector, @@ -355,9 +347,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_17_to_31( data: &[u8], state: &mut CrcState, @@ -394,9 +386,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn process_32_to_255( data: &[u8], state: &mut CrcState, @@ -456,9 +448,9 @@ where #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] unsafe fn get_last_two_xmms( data: &[u8], remaining_len: usize, diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs index fc653d4..884ed82 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64.rs @@ -221,7 +221,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { vreinterpretq_u8_p128(vmull_p64( vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), @@ -230,7 +230,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { // Low 64 bits of a, high 64 bits of b let a_low = 
vgetq_lane_p64(vreinterpretq_p64_u8(a), 1); @@ -239,7 +239,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { vreinterpretq_u8_p128(vmull_p64( vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), @@ -248,7 +248,7 @@ impl ArchOps for AArch64Ops { } #[inline] - #[target_feature(enable = "neon,aes")] + #[target_feature(enable = "aes")] unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { vreinterpretq_u8_p128(vmull_p64( vgetq_lane_p64(vreinterpretq_p64_u8(a), 1), @@ -258,7 +258,7 @@ impl ArchOps for AArch64Ops { #[inline] #[cfg(target_feature = "sha3")] - #[target_feature(enable = "neon,sha3")] + #[target_feature(enable = "sha3")] unsafe fn xor3_vectors( &self, a: Self::Vector, diff --git a/src/arch/mod.rs b/src/arch/mod.rs index c2eae65..22849e0 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -5,6 +5,9 @@ //! It dispatches to the appropriate architecture-specific implementation //! based on the target architecture. +#[cfg(target_arch = "aarch64")] +use std::arch::is_aarch64_feature_detected; + #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] use crate::algorithm; @@ -14,13 +17,14 @@ use crate::structs::CrcParams; use crate::structs::{Width32, Width64}; #[cfg(target_arch = "aarch64")] -use crate::arch::aarch64::AArch64Ops; +use aarch64::AArch64Ops; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::arch::x86::X86Ops; +use x86::X86Ops; +//#[rustversion::since(1.89)] #[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] -use crate::arch::vpclmulqdq::Vpclmulqdq512Ops; +use vpclmulqdq::Vpclmulqdq512Ops; mod aarch64; mod software; @@ -33,84 +37,138 @@ mod x86; /// # Safety /// May use native CPU features #[inline] -#[cfg_attr( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") -)] -#[cfg_attr( - all(target_arch = "x86_64", feature = "vpclmulqdq"), - target_feature(enable = "avx2,vpclmulqdq,avx512f,avx512vl") -)] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "aes")] pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { - #[cfg(target_arch = "aarch64")] + let ops = AArch64Ops; + + match params.width { + 64 => algorithm::update::(state, bytes, params, &ops), + 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), + } +} + +//#[rustversion::before(1.89)] +#[inline] +#[cfg(all( + not(feature = "vpclmulqdq"), + any(target_arch = "x86", target_arch = "x86_64") +))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + update_x86_sse(state, bytes, params) +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(all(feature = "vpclmulqdq", target_arch = "x86"))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + update_x86_sse(state, bytes, params) +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(all(feature = "vpclmulqdq", target_arch = "x86_64"))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + use std::arch::is_x86_feature_detected; + + if bytes.len() >= 256 + && 
is_x86_feature_detected!("vpclmulqdq") + && is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512vl") { - let ops = AArch64Ops; + let ops = Vpclmulqdq512Ops::new(); - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => { - algorithm::update::(state as u32, bytes, params, &ops) as u64 - } + return match params.width { + 64 => algorithm::update::(state, bytes, params, &ops), + 32 => algorithm::update::(state as u32, bytes, params, &ops) + as u64, _ => panic!("Unsupported CRC width: {}", params.width), - } + }; } - #[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] - { - use std::arch::is_x86_feature_detected; - - if bytes.len() >= 256 && is_x86_feature_detected!("vpclmulqdq") { - let ops = Vpclmulqdq512Ops::new(); - - return match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::( - state as u32, - bytes, - params, - &ops, - ) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), - }; - } + // fallback to the standard x86 SSE implementation + update_x86_sse(state, bytes, params) +} + +#[inline] +#[cfg(all( + not(target_arch = "x86"), + not(target_arch = "x86_64"), + not(target_arch = "aarch64") +))] +pub(crate) unsafe fn update(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + software::update(state, bytes, params) +} + +#[inline] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "ssse3,sse4.1,pclmulqdq")] +unsafe fn update_x86_sse(state: u64, bytes: &[u8], params: CrcParams) -> u64 { + let ops = X86Ops; + + match params.width { + 64 => algorithm::update::(state, bytes, params, &ops), + 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, + _ => panic!("Unsupported CRC width: {}", params.width), } +} - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +//#[rustversion::before(1.89)] +#[cfg(not(feature = "vpclmulqdq"))] +pub fn get_target() -> String { + #[cfg(target_arch = "aarch64")] { - let ops = X86Ops; - - match params.width { - 64 => algorithm::update::(state, bytes, params, &ops), - 32 => algorithm::update::(state as u32, bytes, params, &ops) as u64, - _ => panic!("Unsupported CRC width: {}", params.width), + if is_aarch64_feature_detected!("sha3") { + return "aarch64-neon-eor3-pclmulqdq".to_string(); } + + "aarch64-neon-pclmulqdq".to_string() } - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] - return software::update(state, bytes, params); + #[allow(unreachable_code)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + return "x86-sse-pclmulqdq".to_string(); + + #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] + return "software-fallback-tables".to_string(); } +//#[rustversion::since(1.89)] +#[cfg(feature = "vpclmulqdq")] pub fn get_target() -> String { - #[cfg(all(target_arch = "aarch64", target_feature = "sha3"))] - return "internal-aarch64-neon-eor3".to_string(); + #[cfg(target_arch = "aarch64")] + { + if is_aarch64_feature_detected!("sha3") { + return "aarch64-neon-eor3-pclmulqdq".to_string(); + } - #[cfg(all(target_arch = "aarch64", not(target_feature = "sha3")))] - return "internal-aarch64-neon".to_string(); + "aarch64-neon-pclmulqdq".to_string() + } - #[cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] + #[cfg(target_arch = "x86_64")] { - if is_x86_feature_detected!("vpclmulqdq") { - return "internal-x86_64-avx512-vpclmulqdq".to_string(); + if is_x86_feature_detected!("vpclmulqdq") + && 
is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512vl") + { + return "x86_64-avx512-vpclmulqdq".to_string(); + } + + if is_x86_feature_detected!("avx2") { + return "x86_64-avx2-pclmulqdq".to_string(); } } #[allow(unreachable_code)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - return "internal-x86-sse-pclmulqdq".to_string(); + return "x86-sse-pclmulqdq".to_string(); #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] - return "software-fallback".to_string(); + return "software-fallback-tables".to_string(); } #[cfg(test)] @@ -120,6 +178,7 @@ mod tests { use crate::crc64::consts::CRC64_NVME; use crate::test::consts::{TEST_256_BYTES_STRING, TEST_ALL_CONFIGS, TEST_CHECK_STRING}; use crate::test::create_aligned_data; + use crate::test::enums::AnyCrcTestConfig; use rand::{rng, Rng}; #[test] @@ -289,100 +348,56 @@ mod tests { #[test] fn test_small_lengths_all() { - let mut rng = rng(); - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 0 to 255 for len in 0..=255 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - // direct update() call, which needs XOROUT applied - let actual = unsafe { - update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() - }; - - assert_eq!( - actual, - expected, - "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", - config.get_name(), - len, - actual, - expected - ); + test_length(len, config); } } } #[test] fn test_medium_lengths() { - let mut rng = rng(); - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 256 to 1024, which should fold and include handling remainders for len in 256..=1024 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - // direct update() call, which needs XOROUT applied - let actual = unsafe { - update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() - }; - - assert_eq!( - actual, - expected, - "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", - config.get_name(), - len, - actual, - expected - ); + test_length(len, config); } } } #[test] fn test_large_lengths() { - let mut rng = rng(); - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test ~1 MiB just before, at, and just after the folding boundaries for len in 1048575..=1048577 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - // direct update() call, which needs XOROUT applied - let actual = unsafe { - update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() - }; - - assert_eq!( - actual, - expected, - "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", - config.get_name(), - len, - actual, - expected - ); + test_length(len, config); } } } + + fn test_length(length: usize, config: &AnyCrcTestConfig) { + let mut data = vec![0u8; length]; + rng().fill(&mut data[..]); + + // Calculate expected CRC using the reference implementation + let expected = config.checksum_with_reference(&data); + + // direct update() 
call, which needs XOROUT applied + let actual = + unsafe { update(config.get_init(), &data, *config.get_params()) ^ config.get_xorout() }; + + assert_eq!( + actual, + expected, + "\nFailed for {} with length {}\nGot: {:016x}\nExpected: {:016x}", + config.get_name(), + length, + actual, + expected + ); + } } diff --git a/src/arch/vpclmulqdq.rs b/src/arch/vpclmulqdq.rs index 9fbc97c..515f802 100644 --- a/src/arch/vpclmulqdq.rs +++ b/src/arch/vpclmulqdq.rs @@ -6,18 +6,31 @@ #![cfg(all(target_arch = "x86_64", feature = "vpclmulqdq"))] +//#[rustversion::since(1.89)] use crate::arch::x86::X86Ops; + +//#[rustversion::since(1.89)] use crate::enums::Reflector; + +//#[rustversion::since(1.89)] use crate::structs::CrcState; + +//#[rustversion::since(1.89)] use crate::traits::{ArchOps, EnhancedCrcWidth}; + +//#[rustversion::since(1.89)] use std::arch::x86_64::*; + +//#[rustversion::since(1.89)] use std::ops::BitXor; /// Implements the ArchOps trait using 512-bit AVX-512 and VPCLMULQDQ instructions at 512 bits. /// Delegates to X86Ops for standard 128-bit operations +//#[rustversion::since(1.89)] #[derive(Debug, Copy, Clone)] pub struct Vpclmulqdq512Ops(X86Ops); +//#[rustversion::since(1.89)] impl Vpclmulqdq512Ops { #[inline(always)] pub fn new() -> Self { @@ -26,9 +39,11 @@ impl Vpclmulqdq512Ops { } // Wrapper for __m512i to make it easier to work with +//#[rustversion::since(1.89)] #[derive(Debug, Copy, Clone)] struct Simd512(__m512i); +//#[rustversion::since(1.89)] impl Simd512 { #[inline] #[target_feature(enable = "avx512f")] @@ -97,15 +112,14 @@ impl Simd512 { } } +//#[rustversion::since(1.89)] impl Vpclmulqdq512Ops { /// Process aligned blocks using VPCLMULQDQ with 4 x 512-bit registers /// /// Note that #[inline(always)] loses the inlining performance boost, despite no native /// target_features being used directly. Odd since that's not how Rust's docs make it sound... 
#[inline] - #[target_feature( - enable = "avx,avx2,avx512f,avx512vl,avx512bw,vpclmulqdq,sse,sse2,sse4.1,pclmulqdq" - )] + #[target_feature(enable = "ssse3,avx2,avx512f,avx512vl,avx512bw,vpclmulqdq,pclmulqdq")] unsafe fn process_blocks( &self, state: &mut CrcState<::Vector>, @@ -325,6 +339,7 @@ impl Vpclmulqdq512Ops { } // 512-bit version of the Reflector +//#[rustversion::since(1.89)] #[derive(Clone, Copy)] enum Reflector512 { NoReflector, @@ -332,6 +347,7 @@ enum Reflector512 { } // Function to create the appropriate reflector based on CRC parameters +//#[rustversion::since(1.89)] #[inline(always)] unsafe fn create_reflector512(reflected: bool) -> Reflector512 { if reflected { @@ -353,6 +369,7 @@ unsafe fn create_reflector512(reflected: bool) -> Reflector512 { } // Function to apply reflection to a 512-bit vector +//#[rustversion::since(1.89)] #[inline(always)] unsafe fn reflect_bytes512(reflector: &Reflector512, data: Simd512) -> Simd512 { match reflector { @@ -362,10 +379,12 @@ unsafe fn reflect_bytes512(reflector: &Reflector512, data: Simd512) -> Simd512 { } // pre-compute the reverse indices for 512-bit shuffling +//#[rustversion::since(1.89)] static REVERSE_INDICES_512: __m512i = unsafe { std::mem::transmute([7u64, 6u64, 5u64, 4u64, 3u64, 2u64, 1u64, 0u64]) }; // Implement a 512-bit byte shuffle function +//#[rustversion::since(1.89)] #[inline] #[target_feature(enable = "avx512f,avx512bw")] unsafe fn shuffle_bytes512(data: Simd512, mask: Simd512) -> Simd512 { @@ -377,6 +396,7 @@ unsafe fn shuffle_bytes512(data: Simd512, mask: Simd512) -> Simd512 { } // Delegate all ArchOps methods to the inner X86Ops instance +//#[rustversion::since(1.89)] impl ArchOps for Vpclmulqdq512Ops { type Vector = __m128i; @@ -405,7 +425,7 @@ impl ArchOps for Vpclmulqdq512Ops { // Delegate all other methods to X86Ops #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair( &self, high: u64, @@ -416,7 +436,7 @@ impl ArchOps for Vpclmulqdq512Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair_non_reflected( &self, high: u64, @@ -426,49 +446,49 @@ impl ArchOps for Vpclmulqdq512Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { self.0.create_vector_from_u64(value, high) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { self.0.extract_u64s(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { self.0.extract_poly64s(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.xor_vectors(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { self.0.load_bytes(ptr) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { self.0.load_aligned(ptr) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "ssse3")] unsafe fn shuffle_bytes(&self, data: 
Self::Vector, mask: Self::Vector) -> Self::Vector { self.0.shuffle_bytes(data, mask) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn blend_vectors( &self, a: Self::Vector, @@ -479,115 +499,115 @@ impl ArchOps for Vpclmulqdq512Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_8(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { self.0.set_all_bytes(value) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector { self.0.create_compare_mask(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.and_vectors(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_32(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_32(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { self.0.create_vector_from_u32(value, high) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_4(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_4(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_8(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_5(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_6(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_7(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_right_12(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { self.0.shift_left_12(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_00(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] 
unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_01(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_10(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { self.0.carryless_mul_11(a, b) } diff --git a/src/arch/x86.rs b/src/arch/x86.rs index acd3db5..3bf635f 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -23,7 +23,7 @@ impl ArchOps for X86Ops { type Vector = __m128i; #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair( &self, high: u64, @@ -39,7 +39,7 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn create_vector_from_u64_pair_non_reflected( &self, high: u64, @@ -50,54 +50,54 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector { // x86 uses custom helper self.create_u64_vector(value, high) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2] { [self.extract_u64_low(vector), self.extract_u64_high(vector)] } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2] { // On x86, poly64s and u64s extraction is the same self.extract_u64s(vector) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_xor_si128(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector { // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector { // x86 requires cast to __m128i* _mm_loadu_si128(ptr as *const __m128i) } #[inline] - #[target_feature(enable = "sse2,sse4.1,ssse3")] + #[target_feature(enable = "ssse3")] unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector { // x86 uses specific SSSE3 instruction _mm_shuffle_epi8(data, mask) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn blend_vectors( &self, a: Self::Vector, @@ -109,14 +109,14 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector { // x86 has a dedicated shift instruction _mm_slli_si128(vector, 8) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector { _mm_set1_epi8(value as i8) } @@ -128,25 +128,25 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn and_vectors(&self, a: 
Self::Vector, b: Self::Vector) -> Self::Vector { _mm_and_si128(a, b) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector { _mm_slli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector { if high { _mm_insert_epi32(_mm_set1_epi32(0), value as i32, 3) @@ -156,79 +156,80 @@ impl ArchOps for X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector { _mm_slli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 4) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 8) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 5) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 6) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 7) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector { _mm_srli_si128(vector, 12) } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector { _mm_slli_si128(vector, 12) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x00) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x01) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x10) } #[inline] - #[target_feature(enable = "sse2,sse4.1,pclmulqdq")] + #[target_feature(enable = "pclmulqdq")] unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector { _mm_clmulepi64_si128(a, b, 0x11) } + //#[rustversion::since(1.89)] #[inline] - #[cfg(any(feature = "vpclmulqdq", feature = "avx512"))] + #[cfg(feature = "vpclmulqdq")] #[target_feature(enable = "avx512f,avx512vl")] unsafe fn xor3_vectors( &self, @@ -236,29 +237,31 @@ impl ArchOps for X86Ops { b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - _mm_ternarylogic_epi64( - a, b, c, 0x96, // XOR3 - ) + if 
is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + return self.xor3_vectors_avx512(a, b, c); + } + + self.xor3_vectors_sse(a, b, c) } + //#[rustversion::before(1.89)] #[inline] - #[cfg(not(any(feature = "vpclmulqdq", feature = "avx512")))] - #[target_feature(enable = "sse2,sse4.1")] + #[cfg(not(feature = "vpclmulqdq"))] + #[target_feature(enable = "sse4.1")] unsafe fn xor3_vectors( &self, a: Self::Vector, b: Self::Vector, c: Self::Vector, ) -> Self::Vector { - // x86 doesn't have native XOR3 in SSE, use two XORs - _mm_xor_si128(_mm_xor_si128(a, b), c) + self.xor3_vectors_sse(a, b, c) } } impl X86Ops { // Helper methods specific to x86/x86_64 #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn set_epi64x(&self, e1: u64, e0: u64) -> __m128i { #[cfg(target_arch = "x86_64")] { @@ -277,7 +280,7 @@ impl X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse4.1")] unsafe fn create_u64_vector(&self, value: u64, high: bool) -> __m128i { if high { self.set_epi64x(value, 0) @@ -287,7 +290,7 @@ impl X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64_low(&self, v: __m128i) -> u64 { #[cfg(target_arch = "x86_64")] { @@ -303,7 +306,7 @@ impl X86Ops { } #[inline] - #[target_feature(enable = "sse2,sse4.1")] + #[target_feature(enable = "sse2")] unsafe fn extract_u64_high(&self, v: __m128i) -> u64 { #[cfg(target_arch = "x86_64")] { @@ -317,4 +320,21 @@ impl X86Ops { lo | (hi << 32) } } + + //#[rustversion::since(1.89)] + #[inline] + #[cfg(feature = "vpclmulqdq")] + #[target_feature(enable = "avx512f,avx512vl")] + unsafe fn xor3_vectors_avx512(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + _mm_ternarylogic_epi64( + a, b, c, 0x96, // XOR3 + ) + } + + #[inline] + #[target_feature(enable = "sse4.1")] + unsafe fn xor3_vectors_sse(&self, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + // x86 doesn't have native XOR3 in SSE, use two XORs + _mm_xor_si128(_mm_xor_si128(a, b), c) + } } diff --git a/src/bin/arch-check.rs b/src/bin/arch-check.rs index da55e65..0e7cb66 100644 --- a/src/bin/arch-check.rs +++ b/src/bin/arch-check.rs @@ -3,11 +3,12 @@ #[cfg(target_arch = "aarch64")] use std::arch::is_aarch64_feature_detected; -use crc_fast::get_calculator_target; -use crc_fast::CrcAlgorithm::{Crc32Iscsi, Crc32IsoHdlc, Crc64Nvme}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use std::arch::is_x86_feature_detected; +use crc_fast::get_calculator_target; +use crc_fast::CrcAlgorithm::{Crc32Iscsi, Crc32IsoHdlc, Crc64Nvme}; + fn main() { // Check the target architecture and call the appropriate function #[cfg(target_arch = "aarch64")] diff --git a/src/bindings/crc32_iscsi.rs b/src/bindings/crc32_iscsi.rs deleted file mode 100644 index 848d748..0000000 --- a/src/bindings/crc32_iscsi.rs +++ /dev/null @@ -1,13 +0,0 @@ -/* automatically generated by rust-bindgen 0.70.1 */ - -extern "C" { - pub static ISCSI_TARGET: *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation."] - pub fn get_iscsi_target() -> *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Calculate CRC-32/ISCSI checksum using hardware acceleration\n\n @param crc0 Initial CRC value (typically 0)\n @param buf Pointer to input data buffer\n @param len Length of input data in bytes\n\n @return Calculated CRC-32/ISCSI checksum"] - pub fn 
crc32_iscsi_impl(crc0: u32, buf: *const ::std::os::raw::c_char, len: usize) -> u32; -} diff --git a/src/bindings/crc32_iso_hdlc.rs b/src/bindings/crc32_iso_hdlc.rs deleted file mode 100644 index 25de98d..0000000 --- a/src/bindings/crc32_iso_hdlc.rs +++ /dev/null @@ -1,13 +0,0 @@ -/* automatically generated by rust-bindgen 0.70.1 */ - -extern "C" { - pub static ISO_HDLC_TARGET: *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Gets the target build properties (CPU architecture and fine-tuning parameters) for this implementation."] - pub fn get_iso_hdlc_target() -> *const ::std::os::raw::c_char; -} -extern "C" { - #[doc = " Calculate CRC-32/ISO_HDLC checksum using hardware acceleration\n\n @param crc0 Initial CRC value (typically 0)\n @param buf Pointer to input data buffer\n @param len Length of input data in bytes\n\n @return Calculated CRC-32/ISO_HDLC checksum"] - pub fn crc32_iso_hdlc_impl(crc0: u32, buf: *const ::std::os::raw::c_char, len: usize) -> u32; -} diff --git a/src/bindings/mod.rs b/src/bindings/mod.rs deleted file mode 100644 index 50df665..0000000 --- a/src/bindings/mod.rs +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. - -//! This module provides bindings to the C implementations of CRC32 algorithms. - -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] -#![allow(unused)] - -use crate::structs::CrcParams; -use std::ffi::CStr; -use std::os::raw::c_char; - -mod crc32_iscsi; -mod crc32_iso_hdlc; - -// note that the initial state needs to be reversed -#[inline(always)] -pub(crate) fn crc32_iso_hdlc(state: u64, data: &[u8], params: CrcParams) -> u64 { - unsafe { - // TODO: Examine the C implementation and see why we have to invert the state... - crc32_iso_hdlc::crc32_iso_hdlc_impl( - !state as u32, - data.as_ptr() as *const c_char, - data.len(), - ) as u64 - ^ params.xorout - } -} - -// note that the initial state needs to be reversed -#[inline(always)] -pub(crate) fn crc32_iscsi(state: u64, data: &[u8], params: CrcParams) -> u64 { - unsafe { - // TODO: Examine the C implementation and see why we have to invert the state... 
- crc32_iscsi::crc32_iscsi_impl(!state as u32, data.as_ptr() as *const c_char, data.len()) - as u64 - ^ params.xorout - } -} - -#[allow(unused)] -pub unsafe fn get_iso_hdlc_target() -> String { - convert_to_string(crc32_iso_hdlc::get_iso_hdlc_target()) -} - -#[allow(unused)] -pub unsafe fn get_iscsi_target() -> String { - convert_to_string(crc32_iscsi::get_iscsi_target()) -} - -fn convert_to_string(ptr: *const c_char) -> String { - unsafe { - // First ensure the pointer isn't null - assert!(!ptr.is_null()); - - // Convert to CStr - this handles finding the null terminator - let c_str = CStr::from_ptr(ptr); - - // Convert to a regular string, handling any invalid UTF-8 - c_str.to_string_lossy().into_owned() - } -} diff --git a/src/crc32/algorithm.rs b/src/crc32/algorithm.rs index 28e0918..6074cd8 100644 --- a/src/crc32/algorithm.rs +++ b/src/crc32/algorithm.rs @@ -230,9 +230,9 @@ impl EnhancedCrcWidth for crate::structs::Width32 { #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub(crate) unsafe fn process_0_to_15( data: &[u8], state: &mut CrcState, diff --git a/src/crc32/fusion/aarch64.rs b/src/crc32/fusion/aarch64.rs new file mode 100644 index 0000000..c9f0207 --- /dev/null +++ b/src/crc32/fusion/aarch64.rs @@ -0,0 +1,1073 @@ +//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on aarch64. +//! +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai using: +//! +//! ./generate -i neon -p crc32c -a v12e_v1 +//! ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 +//! ./generate -i neon -p crc32 -a v12e_v1 +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. 
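+
+// The safe wrappers below dispatch on buffer size: payloads of at most 1 KiB go through the
+// plain NEON `v12e_v1` kernels, while larger payloads use the `eor3_v9s3x2e_s3` kernels when
+// the `sha3` target feature (which provides EOR3) is enabled at compile time; without `sha3`,
+// `v12e_v1` handles every length.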
+ +#![cfg(target_arch = "aarch64")] + +use std::arch::aarch64::*; + +/// Safe wrapper for CRC32 iSCSI calculation +#[inline] +#[cfg(target_feature = "sha3")] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iscsi_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) + } + } +} + +#[inline] +#[cfg(not(target_feature = "sha3"))] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iscsi_v12e_v1(crc, data.as_ptr(), data.len()) } +} + +/// Safe wrapper for CRC32 ISO-HDLC calculation +#[inline] +#[cfg(target_feature = "sha3")] +pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { + unsafe { + const LARGE_BUFFER_THRESHOLD: usize = 1024; + + // Select implementation based on buffer size + if data.len() <= LARGE_BUFFER_THRESHOLD { + crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) + } else { + crc32_iso_hdlc_eor3_v9s3x2e_s3(crc, data.as_ptr(), data.len()) + } + } +} + +#[inline] +#[cfg(not(target_feature = "sha3"))] +pub fn crc32_iso_hdlc(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iso_hdlc_v12e_v1(crc, data.as_ptr(), data.len()) } +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn clmul_lo_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + // Polynomial multiply low parts - convert u128 result to uint64x2_t + let result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); + vreinterpretq_u64_p128(result) +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn clmul_hi_eor3(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + // Polynomial multiply high parts - convert u128 result to uint64x2_t + let result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); + vreinterpretq_u64_p128(result) +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn clmul_scalar(a: u32, b: u32) -> uint64x2_t { + // Polynomial multiply scalars - convert u128 result to uint64x2_t + let result = vmull_p64(a as u64, b as u64); + vreinterpretq_u64_p128(result) +} + +// x^n mod P, in log(n) time +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction + acc = unsafe { __crc32cw(acc, 0) }; + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + unsafe { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32cd(0, y << low); + } + } + acc +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_crc32_iscsi((nbytes * 8 - 33) as u64)) +} + +// x^n mod P, in log(n) time +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn xnmodp_iso_hdlc(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } 
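+    // Identical square-and-reduce structure to xnmodp_crc32_iscsi above; the only difference
+    // is that this ISO-HDLC variant reduces with the plain CRC-32 instructions (__crc32w /
+    // __crc32d) rather than the CRC-32C ones, so x^n is taken modulo the ISO-HDLC polynomial.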
+ stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // ARM CRC32 instruction (ISO-HDLC uses standard CRC32, not CRC32C) + acc = unsafe { __crc32w(acc, 0) }; + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + unsafe { + // Convert to polynomial type and square it + let x = vreinterpret_p8_u64(vmov_n_u64(acc as u64)); + let squared = vmull_p8(x, x); + let y = vgetq_lane_u64(vreinterpretq_u64_p16(squared), 0); + acc = __crc32d(0, y << low); + } + } + acc +} + +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes")] +unsafe fn crc_shift_iso_hdlc(crc: u32, nbytes: usize) -> uint64x2_t { + clmul_scalar(crc, xnmodp_iso_hdlc((nbytes * 8 - 33) as u64)) +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3 +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes,sha3")] +unsafe fn crc32_iscsi_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0]; + let mut k = vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y1 = clmul_lo_eor3(x1, k); + x1 = clmul_hi_eor3(x1, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y3 = clmul_lo_eor3(x3, k); + x3 = clmul_hi_eor3(x3, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y5 = clmul_lo_eor3(x5, k); + x5 = clmul_hi_eor3(x5, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + let y7 = clmul_lo_eor3(x7, k); + x7 = clmul_hi_eor3(x7, k); + let y8 = clmul_lo_eor3(x8, k); + x8 = clmul_hi_eor3(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = 
__crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... x8 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iscsi(crc1, klen + blk * 144); + let vc2 = crc_shift_iscsi(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_lo_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + // Polynomial multiply low parts and XOR with c + let mul_result = vmull_p64(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)); + let mul_vec = vreinterpretq_u64_p128(mul_result); + veorq_u64(mul_vec, c) +} + +#[inline] +#[target_feature(enable = "aes")] +unsafe fn clmul_hi_e(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + // 
Polynomial multiply high parts and XOR with c + let mul_result = vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)); + let mul_vec = vreinterpretq_u64_p128(mul_result); + veorq_u64(mul_vec, c) +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon -p crc32c -a v12e_v1 +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_e(x1, k, y1); + let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_e(x2, k, y2); + let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_e(x3, k, y3); + let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_e(x4, k, y4); + let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_e(x5, k, y5); + let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_e(x6, k, y6); + let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_e(x7, k, y7); + let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_e(x8, k, y8); + let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = clmul_hi_e(x9, k, y9); + let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_e(x10, k, y10); + let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_e(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... 
x11 to just x0 + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x1); + x0 = clmul_hi_e(x0, k, y0); + let y2 = clmul_lo_e(x2, k, x3); + x2 = clmul_hi_e(x2, k, y2); + let y4 = clmul_lo_e(x4, k, x5); + x4 = clmul_hi_e(x4, k, y4); + let y6 = clmul_lo_e(x6, k, x7); + x6 = clmul_hi_e(x6, k, y6); + let y8 = clmul_lo_e(x8, k, x9); + x8 = clmul_hi_e(x8, k, y8); + let y10 = clmul_lo_e(x10, k, x11); + x10 = clmul_hi_e(x10, k, y10); + + let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x2); + x0 = clmul_hi_e(x0, k, y0); + let y4 = clmul_lo_e(x4, k, x6); + x4 = clmul_hi_e(x4, k, y4); + let y8 = clmul_lo_e(x8, k, x10); + x8 = clmul_hi_e(x8, k, y8); + + let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32cd(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32cb(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 +#[inline] +#[cfg(target_feature = "sha3")] +#[target_feature(enable = "aes,sha3")] +unsafe fn crc32_iso_hdlc_eor3_v9s3x2e_s3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let blk = len / 192; + let klen = blk * 16; + let buf2 = buf.add(klen * 3); + let limit = buf.add(klen).sub(32); + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk + let mut x0 = vld1q_u64(buf2 as *const u64); + let mut x1 = vld1q_u64(buf2.add(16) as *const u64); + let mut x2 = vld1q_u64(buf2.add(32) as *const u64); + let mut x3 = vld1q_u64(buf2.add(48) as *const u64); + let mut x4 = vld1q_u64(buf2.add(64) as *const u64); + let mut x5 = vld1q_u64(buf2.add(80) as *const u64); + let mut x6 = vld1q_u64(buf2.add(96) as *const u64); + let mut x7 = vld1q_u64(buf2.add(112) as *const u64); + let mut x8 = vld1q_u64(buf2.add(128) as *const u64); + + // ISO-HDLC specific constants + let k_vals: [u64; 2] = [0x26b70c3d, 0x3f41287a]; + let mut k = 
vld1q_u64(k_vals.as_ptr()); + let mut buf2 = buf2.add(144); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y1 = clmul_lo_eor3(x1, k); + x1 = clmul_hi_eor3(x1, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y3 = clmul_lo_eor3(x3, k); + x3 = clmul_hi_eor3(x3, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y5 = clmul_lo_eor3(x5, k); + x5 = clmul_hi_eor3(x5, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + let y7 = clmul_lo_eor3(x7, k); + x7 = clmul_hi_eor3(x7, k); + let y8 = clmul_lo_eor3(x8, k); + x8 = clmul_hi_eor3(x8, k); + + x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64)); + x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64)); + x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64)); + x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64)); + x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64)); + x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64)); + x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64)); + x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64)); + x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64)); + + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + buf = buf.add(16); + buf2 = buf2.add(144); + } + + // Reduce x0 ... x8 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x1); + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + x7 = x8; + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y2 = clmul_lo_eor3(x2, k); + x2 = clmul_hi_eor3(x2, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + let y6 = clmul_lo_eor3(x6, k); + x6 = clmul_hi_eor3(x6, k); + + x0 = veor3q_u64(x0, y0, x1); + x2 = veor3q_u64(x2, y2, x3); + x4 = veor3q_u64(x4, y4, x5); + x6 = veor3q_u64(x6, y6, x7); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + let y4 = clmul_lo_eor3(x4, k); + x4 = clmul_hi_eor3(x4, k); + + x0 = veor3q_u64(x0, y0, x2); + x4 = veor3q_u64(x4, y4, x6); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_eor3(x0, k); + x0 = clmul_hi_eor3(x0, k); + x0 = veor3q_u64(x0, y0, x4); + + // Final scalar chunk + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = __crc32d(crc0, *(buf.add(8) as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + blk * 144); + let vc1 = crc_shift_iso_hdlc(crc1, klen + blk * 144); + let vc2 = crc_shift_iso_hdlc(crc2, blk * 144); + let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1)); + + buf = buf2; + len = 
end.offset_from(buf) as usize; + } + + if len >= 32 { + let klen = ((len - 8) / 24) * 8; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // Main loop + loop { + crc0 = __crc32d(crc0, *(buf as *const u64)); + crc1 = __crc32d(crc1, *(buf.add(klen) as *const u64)); + crc2 = __crc32d(crc2, *(buf.add(klen * 2) as *const u64)); + buf = buf.add(8); + len -= 24; + if len < 32 { + break; + } + } + + let vc0 = crc_shift_iso_hdlc(crc0, klen * 2 + 8); + let vc1 = crc_shift_iso_hdlc(crc1, klen + 8); + let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0); + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = __crc32d(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i neon -p crc32 -a v12e_v1 +#[inline] +#[target_feature(enable = "aes")] +unsafe fn crc32_iso_hdlc_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 192 { + let end = buf.add(len); + let limit = buf.add(len - 192); + + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + let mut x1 = vld1q_u64(buf.add(16) as *const u64); + let mut x2 = vld1q_u64(buf.add(32) as *const u64); + let mut x3 = vld1q_u64(buf.add(48) as *const u64); + let mut x4 = vld1q_u64(buf.add(64) as *const u64); + let mut x5 = vld1q_u64(buf.add(80) as *const u64); + let mut x6 = vld1q_u64(buf.add(96) as *const u64); + let mut x7 = vld1q_u64(buf.add(112) as *const u64); + let mut x8 = vld1q_u64(buf.add(128) as *const u64); + let mut x9 = vld1q_u64(buf.add(144) as *const u64); + let mut x10 = vld1q_u64(buf.add(160) as *const u64); + let mut x11 = vld1q_u64(buf.add(176) as *const u64); + + // ISO-HDLC specific constants for small implementation + let k_vals: [u64; 2] = [0x596c8d81, 0xf5e48c85]; + let mut k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(192); + + // Main loop + while buf <= limit { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + let y1 = clmul_lo_e(x1, k, vld1q_u64(buf.add(16) as *const u64)); + x1 = clmul_hi_e(x1, k, y1); + let y2 = clmul_lo_e(x2, k, vld1q_u64(buf.add(32) as *const u64)); + x2 = clmul_hi_e(x2, k, y2); + let y3 = clmul_lo_e(x3, k, vld1q_u64(buf.add(48) as *const u64)); + x3 = clmul_hi_e(x3, k, y3); + let y4 = clmul_lo_e(x4, k, vld1q_u64(buf.add(64) as *const u64)); + x4 = clmul_hi_e(x4, k, y4); + let y5 = clmul_lo_e(x5, k, vld1q_u64(buf.add(80) as *const u64)); + x5 = clmul_hi_e(x5, k, y5); + let y6 = clmul_lo_e(x6, k, vld1q_u64(buf.add(96) as *const u64)); + x6 = clmul_hi_e(x6, k, y6); + let y7 = clmul_lo_e(x7, k, vld1q_u64(buf.add(112) as *const u64)); + x7 = clmul_hi_e(x7, k, y7); + let y8 = clmul_lo_e(x8, k, vld1q_u64(buf.add(128) as *const u64)); + x8 = clmul_hi_e(x8, k, y8); + let y9 = clmul_lo_e(x9, k, vld1q_u64(buf.add(144) as *const u64)); + x9 = 
clmul_hi_e(x9, k, y9); + let y10 = clmul_lo_e(x10, k, vld1q_u64(buf.add(160) as *const u64)); + x10 = clmul_hi_e(x10, k, y10); + let y11 = clmul_lo_e(x11, k, vld1q_u64(buf.add(176) as *const u64)); + x11 = clmul_hi_e(x11, k, y11); + buf = buf.add(192); + } + + // Reduce x0 ... x11 to just x0 + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x1); + x0 = clmul_hi_e(x0, k, y0); + let y2 = clmul_lo_e(x2, k, x3); + x2 = clmul_hi_e(x2, k, y2); + let y4 = clmul_lo_e(x4, k, x5); + x4 = clmul_hi_e(x4, k, y4); + let y6 = clmul_lo_e(x6, k, x7); + x6 = clmul_hi_e(x6, k, y6); + let y8 = clmul_lo_e(x8, k, x9); + x8 = clmul_hi_e(x8, k, y8); + let y10 = clmul_lo_e(x10, k, x11); + x10 = clmul_hi_e(x10, k, y10); + + let k_vals: [u64; 2] = [0xf1da05aa, 0x81256527]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x2); + x0 = clmul_hi_e(x0, k, y0); + let y4 = clmul_lo_e(x4, k, x6); + x4 = clmul_hi_e(x4, k, y4); + let y8 = clmul_lo_e(x8, k, x10); + x8 = clmul_hi_e(x8, k, y8); + + let k_vals: [u64; 2] = [0x8f352d95, 0x1d9513d7]; + k = vld1q_u64(k_vals.as_ptr()); + + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + x4 = x8; + let y0 = clmul_lo_e(x0, k, x4); + x0 = clmul_hi_e(x0, k, y0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + len = end.offset_from(buf) as usize; + } + + if len >= 16 { + // First vector chunk + let mut x0 = vld1q_u64(buf as *const u64); + + let k_vals: [u64; 2] = [0xae689191, 0xccaa009e]; + let k = vld1q_u64(k_vals.as_ptr()); + + // Create CRC vector and XOR with first vector + let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0); + x0 = veorq_u64(crc_vec, x0); + buf = buf.add(16); + len -= 16; + + // Main loop + while len >= 16 { + let y0 = clmul_lo_e(x0, k, vld1q_u64(buf as *const u64)); + x0 = clmul_hi_e(x0, k, y0); + buf = buf.add(16); + len -= 16; + } + + // Reduce 128 bits to 32 bits, and multiply by x^32 + crc0 = __crc32d(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32d(crc0, vgetq_lane_u64(x0, 1)); + } + + while len >= 8 { + crc0 = __crc32d(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + while len > 0 { + crc0 = __crc32b(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::consts::TEST_CHECK_STRING; + use crc::{Crc, Table}; + use rand::{rng, Rng}; + + const RUST_CRC32_ISO_HDLC: Crc> = + Crc::>::new(&crc::CRC_32_ISO_HDLC); + + const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); + + #[test] + fn test_crc32_iso_hdlc_check() { + assert_eq!( + crc32_iso_hdlc(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xcbf43926 + ); + } + + #[test] + fn test_crc32_iso_hdlc_small_all_lengths() { + for len in 1..=255 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iso_hdlc_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iso_hdlc_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + crc32_iso_hdlc_random(len) + } + } + + #[test] + fn test_crc32_iscsi_check() { + assert_eq!( + crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xe3069283 + ); + } + + #[test] + fn test_crc32_iscsi_small_all_lengths() { + for len in 1..=255 { + crc32_iscsi_random(len); + } + 
} + + #[test] + fn test_crc32_iscsi_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + crc32_iscsi_random(len); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = "sha3"))] + fn crc32_iso_hdlc_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISO_HDLC.checksum(&data); + + assert_eq!(crc32_iso_hdlc(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iso_hdlc_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(target_feature = "sha3")] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_eor3_v9s3x2e_s3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + #[cfg(not(target_feature = "sha3"))] + fn crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_v12e_v1(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } +} diff --git a/src/crc32/fusion/mod.rs b/src/crc32/fusion/mod.rs new file mode 100644 index 0000000..d75a64b --- /dev/null +++ b/src/crc32/fusion/mod.rs @@ -0,0 +1,34 @@ +// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. + +//! This module provides support for calculating CRC-32/ISO-HDLC and CRC-32/ISCSI using +//! fusion techniques. +//! +//! https://www.corsix.org/content/fast-crc32c-4k +//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq +//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ +//! 
https://github.com/corsix/fast-crc32/ + +mod aarch64; +mod x86; + +#[inline(always)] +#[allow(unused)] +pub(crate) fn crc32_iso_hdlc(state: u32, data: &[u8]) -> u32 { + #[cfg(target_arch = "aarch64")] + return aarch64::crc32_iso_hdlc(state, data); + + #[cfg(not(target_arch = "aarch64"))] + panic!("CRC-32/ISO-HDLC with fusion is only supported on AArch64 architecture"); +} + +#[inline(always)] +pub(crate) fn crc32_iscsi(state: u32, data: &[u8]) -> u32 { + #[cfg(target_arch = "aarch64")] + return aarch64::crc32_iscsi(state, data); + + #[cfg(target_arch = "x86_64")] + return x86::crc32_iscsi(state, data); + + #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))] + panic!("CRC-32/ISCSI with fusion is only supported on AArch64 and X86_64 architectures"); +} diff --git a/src/crc32/fusion/x86.rs b/src/crc32/fusion/x86.rs new file mode 100644 index 0000000..8a68dbc --- /dev/null +++ b/src/crc32/fusion/x86.rs @@ -0,0 +1,748 @@ +//! Provides CRC-32/ISCSI and CRC-32/ISO-HDLC calculations using a fusion of native CLMUL +//! instructions and native CRC calculation instructions on x86_64. +//! +//! https://www.corsix.org/content/fast-crc32c-4k +//! https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq +//! +//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +//! with the help of Claude.ai using: +//! +//! ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 +//! ./generate -i avx512 -p crc32c -a v4s3x3 +//! ./generate -i sse -p crc32c -a v4s3x3 +//! +//! Modified as necessary for this Rust implementation. +//! +//! MIT licensed. + +#![cfg(target_arch = "x86_64")] + +use std::arch::x86_64::*; + +/// Safe wrapper for CRC32 iSCSI calculation using AVX-512 +//#[rustversion::before(1.89)] +#[inline(always)] +#[cfg(not(feature = "vpclmulqdq"))] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +//#[rustversion::since(1.89)] +#[inline(always)] +#[cfg(feature = "vpclmulqdq")] +pub fn crc32_iscsi(crc: u32, data: &[u8]) -> u32 { + if is_x86_feature_detected!("vpclmulqdq") + && is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512vl") + { + unsafe { + return crc32_iscsi_avx512_vpclmulqdq_v3x2(crc, data.as_ptr(), data.len()); + } + } + + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + unsafe { + return crc32_iscsi_avx512_v4s3x3(crc, data.as_ptr(), data.len()); + } + } + + unsafe { crc32_iscsi_sse_v4s3x3(crc, data.as_ptr(), data.len()) } +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] +unsafe fn clmul_lo_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 0) +} + +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq")] +unsafe fn clmul_hi_avx512_vpclmulqdq(a: __m512i, b: __m512i) -> __m512i { + _mm512_clmulepi64_epi128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_lo_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 0) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_hi_sse(a: __m128i, b: __m128i) -> __m128i { + _mm_clmulepi64_si128(a, b, 17) +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn clmul_scalar_sse(a: u32, b: u32) -> __m128i { + _mm_clmulepi64_si128(_mm_cvtsi32_si128(a as i32), _mm_cvtsi32_si128(b as 
i32), 0) +} + +// x^n mod P, in log(n) time +#[target_feature(enable = "sse4.2,pclmulqdq")] +unsafe fn xnmodp_iscsi_sse(mut n: u64) -> u32 { + let mut stack = !1u64; + let mut acc: u32; + let mut low: u32; + + while n > 191 { + stack = (stack << 1) + (n & 1); + n = (n >> 1) - 16; + } + stack = !stack; + acc = 0x80000000u32 >> (n & 31); + n >>= 5; + + while n > 0 { + // Use hardware CRC32C instruction + acc = _mm_crc32_u32(acc, 0); + n -= 1; + } + + while { + low = (stack & 1) as u32; + stack >>= 1; + stack != 0 + } { + let x = _mm_cvtsi32_si128(acc as i32); + let y = _mm_cvtsi128_si64(_mm_clmulepi64_si128(x, x, 0)) as u64; + acc = _mm_crc32_u64(0, y << low) as u32; + } + acc +} + +#[inline] +#[target_feature(enable = "pclmulqdq")] +unsafe fn crc_shift_iscsi_sse(crc: u32, nbytes: usize) -> __m128i { + clmul_scalar_sse(crc, xnmodp_iscsi_sse((nbytes * 8 - 33) as u64)) +} + +#[inline] +#[target_feature(enable = "sse4.1")] +unsafe fn mm_extract_epi64(val: __m128i, idx: i32) -> u64 { + if idx == 0 { + _mm_cvtsi128_si64(val) as u64 + } else { + _mm_cvtsi128_si64(_mm_srli_si128(val, 8)) as u64 + } +} + +#[inline] +#[target_feature(enable = "sse4.2")] +unsafe fn mm_crc32_u64(crc: u32, val: u64) -> u32 { + _mm_crc32_u64(crc.into(), val) as u32 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i avx512_vpclmulqdq -p crc32c -a v3x2 +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx512f,avx512vl,vpclmulqdq,sse4.2")] +pub unsafe fn crc32_iscsi_avx512_vpclmulqdq_v3x2( + mut crc0: u32, + mut buf: *const u8, + mut len: usize, +) -> u32 { + // Align to 8-byte boundary + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Align to 64-byte boundary (cache line) + while (buf as usize & 56) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 384 { + // First vector chunk - load three 512-bit vectors (192 bytes total) + let mut x0 = _mm512_loadu_si512(buf as *const __m512i); + let mut x1 = _mm512_loadu_si512(buf.add(64) as *const __m512i); + let mut x2 = _mm512_loadu_si512(buf.add(128) as *const __m512i); + + // Create the multiplication constant vector + // Pattern: [0xa87ab8a8, 0, 0xab7aff2a, 0] repeated across all 128-bit lanes + let k_128 = _mm_setr_epi32(0xa87ab8a8u32 as i32, 0, 0xab7aff2au32 as i32, 0); + let mut k = _mm512_broadcast_i32x4(k_128); + + // XOR the CRC into the first vector's low 32 bits + let crc_vec = _mm512_castsi128_si512(_mm_cvtsi32_si128(crc0 as i32)); + x0 = _mm512_xor_si512(crc_vec, x0); + + // First round of polynomial multiplication + let mut y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + let mut y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + let mut y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + // 0x96 = A XOR B XOR C in ternary logic notation + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; 
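+
+        // x0..x2 now hold the folded state for the first 384 bytes; each iteration of the
+        // main loop below folds another 384 bytes into them with two rounds of 512-bit
+        // carry-less multiplies, combined via _mm512_ternarylogic_epi64 (three-way XOR).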
+ + // Main loop - process 384 bytes at a time + while len >= 384 { + // First folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512(buf as *const __m512i), 0x96); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(64) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(128) as *const __m512i), + 0x96, + ); + + // Second folding step + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + y1 = clmul_lo_avx512_vpclmulqdq(x1, k); + x1 = clmul_hi_avx512_vpclmulqdq(x1, k); + y2 = clmul_lo_avx512_vpclmulqdq(x2, k); + x2 = clmul_hi_avx512_vpclmulqdq(x2, k); + + x0 = _mm512_ternarylogic_epi64( + x0, + y0, + _mm512_loadu_si512(buf.add(192) as *const __m512i), + 0x96, + ); + x1 = _mm512_ternarylogic_epi64( + x1, + y1, + _mm512_loadu_si512(buf.add(256) as *const __m512i), + 0x96, + ); + x2 = _mm512_ternarylogic_epi64( + x2, + y2, + _mm512_loadu_si512(buf.add(320) as *const __m512i), + 0x96, + ); + + buf = buf.add(384); + len -= 384; + } + + // Reduce x0, x1, x2 to just x0 + let k_128 = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + k = _mm512_broadcast_i32x4(k_128); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + x1 = x2; + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + x0 = clmul_hi_avx512_vpclmulqdq(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, x1, 0x96); + + // Reduce 512 bits to 128 bits + // Multiple reduction constants for different parts of the 512-bit vector + k = _mm512_setr_epi32( + 0x1c291d04u32 as i32, + 0, + 0xddc0152bu32 as i32, + 0, // Lane 0 + 0x3da6d0cbu32 as i32, + 0, + 0xba4fc28eu32 as i32, + 0, // Lane 1 + 0xf20c0dfeu32 as i32, + 0, + 0x493c7d27u32 as i32, + 0, // Lane 2 + 0, + 0, + 0, + 0, // Lane 3 (unused) + ); + + y0 = clmul_lo_avx512_vpclmulqdq(x0, k); + k = clmul_hi_avx512_vpclmulqdq(x0, k); + y0 = _mm512_xor_si512(y0, k); + + // Extract 128-bit lanes and combine them + let lane0 = _mm512_castsi512_si128(y0); + let lane1 = _mm512_extracti32x4_epi32(y0, 1); + let lane2 = _mm512_extracti32x4_epi32(y0, 2); + let lane3 = _mm512_extracti32x4_epi32(x0, 3); + + // Combine all lanes using ternary logic + let mut z0 = _mm_ternarylogic_epi64(lane0, lane1, lane2, 0x96); + z0 = _mm_xor_si128(z0, lane3); + + // Reduce 128 bits to 32 bits using CRC32 instructions + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0) as u64) as u32; + crc0 = _mm_crc32_u64(crc0.into(), _mm_extract_epi64(z0, 1) as u64) as u32; + } + + // Process remaining 8-byte chunks + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i avx512 -p crc32c -a v4s3x3 +//#[rustversion::since(1.89)] +#[inline] +#[cfg(feature = "vpclmulqdq")] +#[target_feature(enable = "avx2,avx512f,avx512vl,pclmulqdq")] +pub unsafe fn crc32_iscsi_avx512_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align 
to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (different from ISO-HDLC) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR with next chunk of data using ternary logic (A XOR B XOR C) + x0 = _mm_ternarylogic_epi64(x0, y0, _mm_loadu_si128(buf2 as *const __m128i), 0x96); + x1 = _mm_ternarylogic_epi64( + x1, + y1, + _mm_loadu_si128(buf2.add(16) as *const __m128i), + 0x96, + ); + x2 = _mm_ternarylogic_epi64( + x2, + y2, + _mm_loadu_si128(buf2.add(32) as *const __m128i), + 0x96, + ); + x3 = _mm_ternarylogic_epi64( + x3, + y3, + _mm_loadu_si128(buf2.add(48) as *const __m128i), + 0x96, + ); + + // Process scalar data in parallel using hardware CRC32C + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... 
x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + x0 = _mm_ternarylogic_epi64(x0, y0, x1, 0x96); + x2 = _mm_ternarylogic_epi64(x2, y2, x3, 0x96); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + let y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + x0 = _mm_ternarylogic_epi64(x0, y0, x2, 0x96); + + // Final scalar chunk + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(8) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 8) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 8) as *const u64)) as u32; + crc0 = _mm_crc32_u64(crc0.into(), *(buf.add(16) as *const u64)) as u32; + crc1 = _mm_crc32_u64(crc1.into(), *(buf.add(klen + 16) as *const u64)) as u32; + crc2 = _mm_crc32_u64(crc2.into(), *(buf.add(klen * 2 + 16) as *const u64)) as u32; + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = _mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0) as u64; + + // Reduce 128 bits to 32 bits, and multiply by x^32 + let x0_low = _mm_extract_epi64(x0, 0) as u64; + let x0_high = _mm_extract_epi64(x0, 1) as u64; + let vec_crc = _mm_crc32_u64(_mm_crc32_u64(0, x0_low), x0_high); + vc ^= _mm_extract_epi64(crc_shift_iscsi_sse(vec_crc as u32, klen * 3 + 8), 0) as u64; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64) ^ vc) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = _mm_crc32_u64(crc0.into(), *(buf as *const u64)) as u32; + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/ +/// using: +/// +/// ./generate -i sse -p crc32c -a v4s3x3 +#[inline] +#[target_feature(enable = "sse4.2,pclmulqdq")] +pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 { + // Align to 8-byte boundary using hardware CRC32C instructions + while len > 0 && (buf as usize & 7) != 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + // Handle 8-byte alignment + if (buf as usize & 8) != 0 && len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + if len >= 144 { + let blk = (len - 8) / 136; + let klen = blk * 24; + let buf2 = buf; + let mut crc1 = 0u32; + let mut crc2 = 0u32; + + // First vector chunk - load four 128-bit vectors (64 bytes total) + let mut x0 = _mm_loadu_si128(buf2 as *const __m128i); + let mut x1 = _mm_loadu_si128(buf2.add(16) as *const __m128i); + let mut x2 = _mm_loadu_si128(buf2.add(32) as *const __m128i); + let mut x3 = _mm_loadu_si128(buf2.add(48) as *const __m128i); + + // iSCSI-specific folding constant (same as AVX-512 version) + let mut k = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0); + + // XOR the CRC into the first 
vector's low 32 bits + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0 as i32), x0); + crc0 = 0; + + let mut buf2 = buf2.add(64); + len -= 136; + buf = buf.add(blk * 64); + + // Main loop - process 144 bytes at a time + while len >= 144 { + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y1 = clmul_lo_sse(x1, k); + x1 = clmul_hi_sse(x1, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + let mut y3 = clmul_lo_sse(x3, k); + x3 = clmul_hi_sse(x3, k); + + // XOR operations using separate XOR instructions (no ternary logic in SSE) + y0 = _mm_xor_si128(y0, _mm_loadu_si128(buf2 as *const __m128i)); + x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128(buf2.add(16) as *const __m128i)); + x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128(buf2.add(32) as *const __m128i)); + x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128(buf2.add(48) as *const __m128i)); + x3 = _mm_xor_si128(x3, y3); + + // Process scalar data in parallel using hardware CRC32C + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + + buf = buf.add(24); + buf2 = buf2.add(64); + len -= 136; + } + + // Reduce x0 ... x3 to just x0 using iSCSI-specific constants + k = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0); + + let mut y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + let mut y2 = clmul_lo_sse(x2, k); + x2 = clmul_hi_sse(x2, k); + + y0 = _mm_xor_si128(y0, x1); + x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3); + x2 = _mm_xor_si128(x2, y2); + + k = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0); + + y0 = clmul_lo_sse(x0, k); + x0 = clmul_hi_sse(x0, k); + y0 = _mm_xor_si128(y0, x2); + x0 = _mm_xor_si128(x0, y0); + + // Final scalar chunk + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(8) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 8) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 8) as *const u64)); + crc0 = mm_crc32_u64(crc0, *(buf.add(16) as *const u64)); + crc1 = mm_crc32_u64(crc1, *(buf.add(klen + 16) as *const u64)); + crc2 = mm_crc32_u64(crc2, *(buf.add(klen * 2 + 16) as *const u64)); + buf = buf.add(24); + + let vc0 = crc_shift_iscsi_sse(crc0, klen * 2 + 8); + let vc1 = crc_shift_iscsi_sse(crc1, klen + 8); + let mut vc = mm_extract_epi64(_mm_xor_si128(vc0, vc1), 0); + + // Reduce 128 bits to 32 bits, and multiply by x^32 + // Extract the two 64-bit parts of x0 and combine them + let x0_low = mm_extract_epi64(x0, 0); + let x0_high = mm_extract_epi64(x0, 1); + let x0_combined = mm_extract_epi64( + crc_shift_iscsi_sse(mm_crc32_u64(mm_crc32_u64(0, x0_low), x0_high), klen * 3 + 8), + 0, + ); + vc ^= x0_combined; + + // Final 8 bytes + buf = buf.add(klen * 2); + crc0 = crc2; + crc0 = mm_crc32_u64(crc0, *(buf as *const u64) ^ vc); + buf = buf.add(8); + len -= 8; + } + + // Process 
remaining 8-byte chunks using hardware CRC32C + while len >= 8 { + crc0 = mm_crc32_u64(crc0, *(buf as *const u64)); + buf = buf.add(8); + len -= 8; + } + + // Process remaining bytes using hardware CRC32C + while len > 0 { + crc0 = _mm_crc32_u8(crc0, *buf); + buf = buf.add(1); + len -= 1; + } + + crc0 +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test::consts::TEST_CHECK_STRING; + use crc::{Crc, Table}; + use rand::{rng, Rng}; + + const RUST_CRC32_ISCSI: Crc> = Crc::>::new(&crc::CRC_32_ISCSI); + + #[test] + fn test_crc32_iscsi_check() { + assert_eq!( + crc32_iscsi(0xffffffff, TEST_CHECK_STRING) ^ 0xffffffff, + 0xe3069283 + ); + } + + #[test] + fn test_crc32_iscsi_small_all_lengths() { + for len in 1..=255 { + test_crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_medium_lengths() { + // Test each length from 256 to 1024, which should fold and include handling remainders + for len in 256..=1024 { + test_crc32_iscsi_random(len); + } + } + + #[test] + fn test_crc32_iscsi_large_lengths() { + // Test 1 MiB just before, at, and just after the folding boundaries + for len in 1048575..1048577 { + test_crc32_iscsi_random(len); + } + } + + //#[rustversion::since(1.89)] + #[cfg(feature = "vpclmulqdq")] + fn test_crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + if is_x86_feature_detected!("vpclmulqdq") + && is_x86_feature_detected!("avx512vl") + && is_x86_feature_detected!("avx512f") + { + assert_eq!( + crc32_iscsi_avx512_vpclmulqdq_v3x2(0xffffffff, data.as_ptr(), data.len()) + ^ 0xffffffff, + checksum + ); + } + + if is_x86_feature_detected!("avx512vl") + && is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("pclmulqdq") + { + assert_eq!( + crc32_iscsi_avx512_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + + assert_eq!( + crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } + + //#[rustversion::before(1.89)] + #[cfg(not(feature = "vpclmulqdq"))] + fn test_crc32_iscsi_random(len: usize) { + let mut data = vec![0u8; len]; + rng().fill(&mut data[..]); + + let checksum = RUST_CRC32_ISCSI.checksum(&data); + + assert_eq!(crc32_iscsi(0xffffffff, &data) ^ 0xffffffff, checksum); + + unsafe { + assert_eq!( + crc32_iscsi_sse_v4s3x3(0xffffffff, data.as_ptr(), data.len()) ^ 0xffffffff, + checksum + ); + } + } +} diff --git a/src/crc32/mod.rs b/src/crc32/mod.rs index fba820b..518f5f2 100644 --- a/src/crc32/mod.rs +++ b/src/crc32/mod.rs @@ -1,6 +1,9 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. -//! This module provides CRC32 support. +//! This module provides CRC-32 support. 
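+//!
+//! On x86_64 and aarch64, the private `fusion` submodule combines the native CRC32
+//! instructions with carry-less multiplication ("fusion"), accelerating CRC-32/ISCSI on
+//! both architectures and CRC-32/ISO-HDLC on aarch64.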
pub mod algorithm; pub mod consts; + +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] +pub(crate) mod fusion; diff --git a/src/crc64/algorithm.rs b/src/crc64/algorithm.rs index 49cd301..5900585 100644 --- a/src/crc64/algorithm.rs +++ b/src/crc64/algorithm.rs @@ -206,9 +206,9 @@ impl EnhancedCrcWidth for crate::structs::Width64 { #[inline] #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), - target_feature(enable = "sse2,sse4.1,pclmulqdq") + target_feature(enable = "ssse3,sse4.1,pclmulqdq") )] -#[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon,aes"))] +#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))] pub(crate) unsafe fn process_0_to_15( data: &[u8], state: &mut CrcState, diff --git a/src/crc64/mod.rs b/src/crc64/mod.rs index 4f86b2c..44eee30 100644 --- a/src/crc64/mod.rs +++ b/src/crc64/mod.rs @@ -1,6 +1,6 @@ // Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0. -//! This module provides CRC64 support. +//! This module provides CRC-64 support. pub mod algorithm; pub mod consts; diff --git a/src/crc64/utils.rs b/src/crc64/utils.rs index 8aa79fa..6e5d58b 100644 --- a/src/crc64/utils.rs +++ b/src/crc64/utils.rs @@ -15,7 +15,7 @@ use std::arch::x86_64::*; #[cfg(target_arch = "aarch64")] #[allow(dead_code)] -#[target_feature(enable = "neon,aes")] +#[target_feature(enable = "aes")] pub(crate) unsafe fn print_xmm_hex(prefix: &str, xmm: uint8x16_t) { let mut temp = [0u64; 2]; vst1q_u64(temp.as_mut_ptr(), vreinterpretq_u64_u8(xmm)); @@ -24,7 +24,7 @@ pub(crate) unsafe fn print_xmm_hex(prefix: &str, xmm: uint8x16_t) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[allow(dead_code)] -#[target_feature(enable = "sse2,sse4.1")] +#[target_feature(enable = "sse4.1")] pub(crate) unsafe fn print_xmm_hex(prefix: &str, xmm: __m128i) { let mut temp = [0u64; 2]; _mm_storeu_si128(temp.as_mut_ptr() as *mut __m128i, xmm); diff --git a/src/lib.rs b/src/lib.rs index 6b66cc5..b4f7a8e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,8 +60,8 @@ //! use crc_fast::{Digest, CrcAlgorithm::Crc32IsoHdlc}; //! //! // for example/test purposes only, use your own file path -//! let binding = env::current_dir().expect("missing working dir").join("crc-check.txt"); -//! let file_on_disk = binding.to_str().unwrap(); +//! let file_path = env::current_dir().expect("missing working dir").join("crc-check.txt"); +//! let file_on_disk = file_path.to_str().unwrap(); //! //! // actual usage //! let mut digest = Digest::new(Crc32IsoHdlc); @@ -97,24 +97,22 @@ //! use crc_fast::{checksum_file, CrcAlgorithm::Crc32IsoHdlc}; //! //! // for example/test purposes only, use your own file path -//! let binding = env::current_dir().expect("missing working dir").join("crc-check.txt"); -//! let file_on_disk = binding.to_str().unwrap(); +//! let file_path = env::current_dir().expect("missing working dir").join("crc-check.txt"); +//! let file_on_disk = file_path.to_str().unwrap(); //! //! let checksum = checksum_file(Crc32IsoHdlc, file_on_disk, None); //! //! assert_eq!(checksum.unwrap(), 0xcbf43926); //! 
``` -// if VPCLMULQDQ or AVX512 is enabled, enable extra AVX512 features -#![cfg_attr( - any(feature = "vpclmulqdq", feature = "avx512"), - feature(stdarch_x86_avx512) -)] - use crate::crc32::consts::{ CRC32_AIXM, CRC32_AUTOSAR, CRC32_BASE91_D, CRC32_BZIP2, CRC32_CD_ROM_EDC, CRC32_CKSUM, CRC32_ISCSI, CRC32_ISO_HDLC, CRC32_JAMCRC, CRC32_MEF, CRC32_MPEG_2, CRC32_XFER, }; + +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] +use crate::crc32::fusion; + use crate::crc64::consts::{ CRC64_ECMA_182, CRC64_GO_ISO, CRC64_MS, CRC64_NVME, CRC64_REDIS, CRC64_WE, CRC64_XZ, }; @@ -126,7 +124,6 @@ use std::io::{Read, Write}; mod algorithm; mod arch; -mod bindings; mod combine; mod consts; mod crc32; @@ -379,8 +376,8 @@ pub fn checksum(algorithm: CrcAlgorithm, buf: &[u8]) -> u64 { /// use crc_fast::{checksum_file, CrcAlgorithm::Crc32IsoHdlc}; /// /// // for example/test purposes only, use your own file path -/// let binding = env::current_dir().expect("missing working dir").join("crc-check.txt"); -/// let file_on_disk = binding.to_str().unwrap(); +/// let file_path = env::current_dir().expect("missing working dir").join("crc-check.txt"); +/// let file_on_disk = file_path.to_str().unwrap(); /// /// let checksum = checksum_file(Crc32IsoHdlc, file_on_disk, None); /// @@ -440,32 +437,17 @@ pub fn checksum_combine( /// Returns the target used to calculate the CRC checksum for the specified algorithm. /// +/// These strings are informational only, not stable, and shouldn't be relied on to match across +/// versions. +/// /// # Examples ///```rust /// use crc_fast::{get_calculator_target, CrcAlgorithm::Crc32IsoHdlc}; /// /// let target = get_calculator_target(Crc32IsoHdlc); /// ``` -pub fn get_calculator_target(algorithm: CrcAlgorithm) -> String { - match algorithm { - CrcAlgorithm::Crc32IsoHdlc => { - #[cfg(optimized_crc32_iso_hdlc)] - unsafe { - bindings::get_iso_hdlc_target() - } - #[cfg(not(optimized_crc32_iso_hdlc))] - arch::get_target() - } - CrcAlgorithm::Crc32Iscsi => { - #[cfg(optimized_crc32_iscsi)] - unsafe { - bindings::get_iscsi_target() - } - #[cfg(not(optimized_crc32_iscsi))] - arch::get_target() - } - _ => arch::get_target(), - } +pub fn get_calculator_target(_algorithm: CrcAlgorithm) -> String { + arch::get_target() } /// Returns the calculator function and parameters for the specified CRC algorithm. @@ -496,53 +478,34 @@ fn get_calculator_params(algorithm: CrcAlgorithm) -> (CalculatorFn, CrcParams) { /// Calculates the CRC-32/ISCSI ("crc32c" in many, but not all, implementations) checksum. /// -/// By default, uses an external optimized C implementation, but can be switched to an internal -/// SIMD-only implementation by using the `internal_simd_only` feature flag. -/// -/// The external optimized implementation is also tunable via feature flags. +/// Because both aarch64 and x86 have native hardware support for CRC-32/ISCSI, we can use +/// fusion techniques to accelerate the calculation beyond what SIMD can do alone. 
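+///
+/// On targets other than aarch64 and x86_64 this falls back to the generic SIMD
+/// `Calculator` path; the 64-bit state is truncated to 32 bits for the fusion kernel
+/// and widened back on return.
+///
+/// The same calculation is exposed through the public `checksum` API, e.g.:
+///
+/// ```rust
+/// use crc_fast::{checksum, CrcAlgorithm::Crc32Iscsi};
+///
+/// // CRC-32/ISCSI check value for "123456789"
+/// assert_eq!(checksum(Crc32Iscsi, b"123456789"), 0xe3069283);
+/// ```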
#[inline(always)] -fn crc32_iscsi_calculator(state: u64, data: &[u8], params: CrcParams) -> u64 { - #[cfg(optimized_crc32_iscsi)] - { - bindings::crc32_iscsi(state, data, params) - } - - #[cfg(not(optimized_crc32_iscsi))] - { - Calculator::calculate(state, data, params) - } +fn crc32_iscsi_calculator(state: u64, data: &[u8], _params: CrcParams) -> u64 { + // both aarch64 and x86 have native CRC-32/ISCSI support, so we can use fusion + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] + return fusion::crc32_iscsi(state as u32, data) as u64; + + #[cfg(all(not(target_arch = "aarch64"), not(target_arch = "x86_64")))] + // fallback to traditional calculation if not aarch64 or x86_64 + Calculator::calculate(state, data, _params) } /// Calculates the CRC-32/ISO-HDLC ("crc32" in many, but not all, implementations) checksum. /// -/// By default, uses an external optimized C implementation, but can be switched to an internal -/// SIMD-only implementation by using the `internal_simd_only` feature flag. -/// -/// The external optimized implementation is also tunable via feature flags.#[inline(always)] -fn crc32_iso_hdlc_calculator(state: u64, data: &[u8], params: CrcParams) -> u64 { - #[cfg(optimized_crc32_iso_hdlc)] - { - // Call the FFI function for CRC-32/ISO-HDLC for large (>1KiB) data payloads - #[cfg(target_arch = "x86_64")] - { - if data.len() > 1024 && std::arch::is_x86_feature_detected!("vpclmulqdq") { - return bindings::crc32_iso_hdlc(state, data, params); - } - - // our internal SIMD implementation for small (<1KiB) data payloads is faster, - // only for CRC-32/ISO_HDLC on non-VPCLMULQDQ platforms - Calculator::calculate(state, data, params) - } - - #[cfg(not(target_arch = "x86_64"))] - // Call the FFI function for CRC-32/ISO-HDLC for all payloads non-x86_64 - return bindings::crc32_iso_hdlc(state, data, params); - } - - #[cfg(not(optimized_crc32_iso_hdlc))] - { - Calculator::calculate(state, data, params) - } +/// Because aarch64 has native hardware support for CRC-32/ISO-HDLC, we can use fusion techniques +/// to accelerate the calculation beyond what SIMD can do alone. x86 does not have native support, +/// so we use the traditional calculation. 
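+///
+/// On targets other than aarch64 the generic SIMD `Calculator` path is used; on aarch64
+/// the 64-bit state is truncated to 32 bits for the fusion kernel and widened back on
+/// return.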
+#[inline(always)] +fn crc32_iso_hdlc_calculator(state: u64, data: &[u8], _params: CrcParams) -> u64 { + // aarch64 CPUs have native CRC-32/ISO-HDLC support, so we can use the fusion implementation + #[cfg(target_arch = "aarch64")] + return fusion::crc32_iso_hdlc(state as u32, data) as u64; + + // x86 CPUs don't have native CRC-32/ISO-HDLC support, so there's no fusion to be had, use + // traditional calculation + #[cfg(not(target_arch = "aarch64"))] + Calculator::calculate(state, data, _params) } #[cfg(test)] @@ -551,6 +514,7 @@ mod lib { use super::*; use crate::test::consts::{TEST_ALL_CONFIGS, TEST_CHECK_STRING}; + use crate::test::enums::AnyCrcTestConfig; use cbindgen::Language::{Cxx, C}; use cbindgen::Style::Both; use rand::{rng, Rng}; @@ -591,68 +555,52 @@ mod lib { #[test] fn test_small_all_lengths() { - let mut rng = rng(); - - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 1 to 255 for len in 1..=255 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - let result = checksum(config.get_algorithm(), &data); - - assert_eq!(result, expected); + test_length(len, config); } } } #[test] fn test_medium_lengths() { - let mut rng = rng(); - - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test each length from 256 to 1024, which should fold and include handling remainders for len in 256..=1024 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); - - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); - - let result = checksum(config.get_algorithm(), &data); - - assert_eq!(result, expected); + test_length(len, config); } } } #[test] fn test_large_lengths() { - let mut rng = rng(); - - // Test each CRC-64 variant for config in TEST_ALL_CONFIGS { // Test 1 MiB just before, at, and just after the folding boundaries for len in 1048575..1048577 { - // Generate random data for this length - let mut data = vec![0u8; len]; - rng.fill(&mut data[..]); + test_length(len, config); + } + } + } - // Calculate expected CRC using the reference implementation - let expected = config.checksum_with_reference(&data); + fn test_length(length: usize, config: &AnyCrcTestConfig) { + let mut data = vec![0u8; length]; + rng().fill(&mut data[..]); - let result = checksum(config.get_algorithm(), &data); + // Calculate expected CRC using the reference implementation + let expected = config.checksum_with_reference(&data); - assert_eq!(result, expected); - } - } + let result = checksum(config.get_algorithm(), &data); + + assert_eq!( + result, + expected, + "Failed for algorithm: {:?}, length: {}, expected: {:#x}, got: {:#x}", + config.get_algorithm(), + length, + expected, + result + ); } #[test] @@ -803,126 +751,53 @@ mod lib { return Ok(()); } - #[cfg(not(target_os = "windows"))] - { - const HEADER: &str = "libcrc_fast.h"; - - let crate_dir = - std::env::var("CARGO_MANIFEST_DIR").map_err(|error| error.to_string())?; - - let mut expected = Vec::new(); - cbindgen::Builder::new() - .with_crate(crate_dir) - .with_include_guard("CRC_FAST_H") - .with_header("/* crc_fast library C/C++ API - Copyright 2025 Don MacAskill */\n/* This header is auto-generated. Do not edit directly. 
*/\n") - // exclude internal implementation functions - .exclude_item("crc32_iscsi_impl") - .exclude_item("crc32_iso_hdlc_impl") - .exclude_item("get_iscsi_target") - .exclude_item("get_iso_hdlc_target") - .exclude_item("ISO_HDLC_TARGET") - .exclude_item("ISCSI_TARGET") - .exclude_item("CrcParams") - .rename_item("Digest", "CrcFastDigest") - .with_style(Both) - // generate C header - .with_language(C) - // with C++ compatibility - .with_cpp_compat(true) - .generate() - .map_err(|error| error.to_string())? - .write(&mut expected); - - // Convert the expected bytes to string for pattern replacement, since cbindgen - // generates an annoying amount of empty contiguous newlines - let header_content = String::from_utf8(expected).map_err(|error| error.to_string())?; - - // Replace excessive newlines (3 or more consecutive newlines) with 2 newlines - let regex = regex::Regex::new(r"\n{3,}").map_err(|error| error.to_string())?; - let cleaned_content = regex.replace_all(&header_content, "\n\n").to_string(); - - // Convert back to bytes - expected = cleaned_content.into_bytes(); - - let actual = read(HEADER).map_err(|error| error.to_string())?; - - if expected != actual { - write(HEADER, expected).map_err(|error| error.to_string())?; - return Err(format!( - "{HEADER} is not up-to-date, commit the generated file and try again" - )); - } - - Ok(()) - } - } - - /// Tests whether the CRC-32/ISO-HDLC bindings are up-to-date - #[test] - #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] - fn test_crc32_iso_hdlc_bindings() -> Result<(), String> { - build_bindgen("crc32_iso_hdlc", "src/bindings/crc32_iso_hdlc.rs") - } - - /// Tests whether the CRC-32/ISCSI bindings are up-to-date - #[test] - #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] - fn test_crc32_iscsi_bindings() -> Result<(), String> { - build_bindgen("crc32_iscsi", "src/bindings/crc32_iscsi.rs") - } - - fn build_bindgen(name: &str, bindings_path: &str) -> Result<(), String> { - // Getting the Rust cross compile toolchain working on x86 such that it builds correctly - // _and_ can validate the header output via bindgen is non-obvious. Since I doubt many - // people are actually doing development work on x86, as opposed to x86_64 or aarch64, - // I'm just going to skip the bindgen tests on x86. The important tests (do these - // CRC-32 variants actually work?) is covered by the other tests, this is just a - // development artifact test. - - #[cfg(target_arch = "x86")] - { - eprintln!("Skipping test on x86 for {} to {}", name, bindings_path); - - return Ok(()); - } - - // Skip this test on Windows, since CRLF vs LF is a PITA - #[cfg(target_os = "windows")] - { - // Skip this test on Windows, since CRLF vs LF is a PITA - eprintln!("Skipping test on Windows"); - - return Ok(()); + const HEADER: &str = "libcrc_fast.h"; + + let crate_dir = std::env::var("CARGO_MANIFEST_DIR").map_err(|error| error.to_string())?; + + let mut expected = Vec::new(); + cbindgen::Builder::new() + .with_crate(crate_dir) + .with_include_guard("CRC_FAST_H") + .with_header("/* crc_fast library C/C++ API - Copyright 2025 Don MacAskill */\n/* This header is auto-generated. Do not edit directly. 
*/\n") + // exclude internal implementation functions + .exclude_item("crc32_iscsi_impl") + .exclude_item("crc32_iso_hdlc_impl") + .exclude_item("get_iscsi_target") + .exclude_item("get_iso_hdlc_target") + .exclude_item("ISO_HDLC_TARGET") + .exclude_item("ISCSI_TARGET") + .exclude_item("CrcParams") + .rename_item("Digest", "CrcFastDigest") + .with_style(Both) + // generate C header + .with_language(C) + // with C++ compatibility + .with_cpp_compat(true) + .generate() + .map_err(|error| error.to_string())? + .write(&mut expected); + + // Convert the expected bytes to string for pattern replacement, since cbindgen + // generates an annoying amount of empty contiguous newlines + let header_content = String::from_utf8(expected).map_err(|error| error.to_string())?; + + // Replace excessive newlines (3 or more consecutive newlines) with 2 newlines + let regex = regex::Regex::new(r"\n{3,}").map_err(|error| error.to_string())?; + let cleaned_content = regex.replace_all(&header_content, "\n\n").to_string(); + + // Convert back to bytes + expected = cleaned_content.into_bytes(); + + let actual = read(HEADER).map_err(|error| error.to_string())?; + + if expected != actual { + write(HEADER, expected).map_err(|error| error.to_string())?; + return Err(format!( + "{HEADER} is not up-to-date, commit the generated file and try again" + )); } - #[cfg(not(any(target_arch = "x86", target_os = "windows")))] - { - let bindings = bindgen::Builder::default() - .header(format!("include/{name}.h")) - .allowlist_function("crc32_iscsi_impl") - .allowlist_function("get_iscsi_target") - .allowlist_var("ISCSI_TARGET") - .allowlist_function("crc32_iso_hdlc_impl") - .allowlist_function("get_iso_hdlc_target") - .allowlist_var("ISO_HDLC_TARGET") - .generate() - .expect("Unable to generate bindings"); - - let expected = bindings.to_string().into_bytes(); - - let actual = read(bindings_path).map_err(|error| error.to_string())?; - - if expected != actual { - bindings - .write_to_file(bindings_path) - .expect("Couldn't write bindings to SRC!"); - - return Err(format!( - "{bindings_path} is not up-to-date, commit the generated file and try again" - )); - } - - Ok(()) - } + Ok(()) } } diff --git a/src/test/mod.rs b/src/test/mod.rs index 9ac64c5..1b74ec4 100644 --- a/src/test/mod.rs +++ b/src/test/mod.rs @@ -5,8 +5,8 @@ #![cfg(test)] #![allow(dead_code)] -pub mod consts; -mod enums; +pub(crate) mod consts; +pub(crate) mod enums; mod structs; /// Creates a new aligned data vector from the input slice for testing.