diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 6a214965d..ff7350f7e 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -23,115 +23,121 @@ jobs: date-polyfill: [OFF] shared: [OFF] cpp_standard: [20] - + runs-on: ${{ matrix.os }} name: Benchmark / ${{matrix.os}} / ${{ matrix.compiler }} / ${{ matrix.version }} / ${{ matrix.arch }} / ${{ matrix.config }} / date-polyfill ${{ matrix.date-polyfill}} / shared ${{ matrix.shared }} / cpp${{ matrix.cpp_standard }} env: SCCACHE_GHA_ENABLED: "true" - + steps: + - name: Run sccache-cache + uses: mozilla-actions/sccache-action@v0.0.9 + + - name: Install pthread + run: | + sudo apt-get update + sudo apt-get install -y libpthread-stubs0-dev libboost-thread-dev + + - uses: rui314/setup-mold@v1 + with: + mold-version: 2.40.1 + make-default: true + + - name: Install GCC + if: matrix.compiler == 'gcc' + uses: egor-tensin/setup-gcc@v1 + with: + version: ${{matrix.version}} + platform: x64 + + - name: Install LLVM and Clang + if: matrix.compiler == 'clang' + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh ${{matrix.version}} + sudo apt-get install -y clang-tools-${{matrix.version}} + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{matrix.version}} 200 + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{matrix.version}} 200 + sudo update-alternatives --install /usr/bin/clang-scan-deps clang-scan-deps /usr/bin/clang-scan-deps-${{matrix.version}} 200 + sudo update-alternatives --set clang /usr/bin/clang-${{matrix.version}} + sudo update-alternatives --set clang++ /usr/bin/clang++-${{matrix.version}} + sudo update-alternatives --set clang-scan-deps /usr/bin/clang-scan-deps-${{matrix.version}} + + - name: Install specific version of tzdata + if: matrix.date-polyfill == 'OFF' + run: sudo apt-get install tzdata=2024a-2ubuntu1 -y --allow-downgrades + + - name: Checkout code + uses: 
actions/checkout@v4 + + - name: Set conda environment + uses: mamba-org/setup-micromamba@main + with: + environment-name: myenv + environment-file: environment-dev.yml + init-shell: bash + cache-downloads: true + + - name: Configure using CMake + run: | + if [[ "${{matrix.compiler}}" = "gcc" ]]; then export CC=gcc-${{matrix.version}}; export CXX=g++-${{matrix.version}}; else export CC=clang; export CXX=clang++; fi + cmake -G Ninja \ + -Bbuild \ + -DCMAKE_BUILD_TYPE:STRING=${{matrix.config}} \ + -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DCMAKE_CXX_STANDARD=${{matrix.cpp_standard}} \ + -DUSE_DATE_POLYFILL=${{matrix.date-polyfill}} \ + -DBUILD_TESTS=OFF \ + -DENABLE_INTEGRATION_TEST=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_BENCHMARKS=ON \ + -DBUILD_COMPARATIVE_BENCHMARKS=ON \ + -DCMAKE_C_COMPILER_LAUNCHER=sccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ + -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \ + -DSPARROW_BUILD_SHARED=${{matrix.shared}} \ + -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \ + -DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold" + + - name: Build the benchmark target(s) + working-directory: build + run: | + cmake --build . --config ${{matrix.config}} --target sparrow_benchmarks + cmake --build . --config ${{matrix.config}} --target sparrow_benchmarks_comparative + + - name: Run benchmarks + working-directory: build + run: | + cmake --build . --config ${{matrix.config}} --target run_benchmarks_json + cmake --build . 
--config ${{matrix.config}} --target run_comparative_benchmarks_json + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: | + ./**/sparrow_benchmarks.json + ./**/sparrow_benchmarks_comparative.json + + - name: Download previous benchmark data + uses: actions/cache@v4 + with: + path: ./cache + key: ${{ runner.os }}-benchmarks + + - name: Store benchmark result + uses: benchmark-action/github-action-benchmark@v1 + with: + # What benchmark tool the output.txt came from + tool: "googlecpp" + # Where the output from the benchmark tool is stored + output-file-path: build/benchmarks/sparrow_benchmarks.json + # Where the previous data file is stored + external-data-json-path: ./cache/benchmark-data.json + # Workflow will fail when an alert happens + fail-on-alert: true - - name: Run sccache-cache - uses: mozilla-actions/sccache-action@v0.0.9 - - - name: Install pthread - run: | - sudo apt-get update - sudo apt-get install -y libpthread-stubs0-dev libboost-thread-dev - - - uses: rui314/setup-mold@v1 - with: - mold-version: 2.40.1 - make-default: true - - - name: Install GCC - if: matrix.compiler == 'gcc' - uses: egor-tensin/setup-gcc@v1 - with: - version: ${{matrix.version}} - platform: x64 - - - name: Install LLVM and Clang - if: matrix.compiler == 'clang' - run: | - wget https://apt.llvm.org/llvm.sh - chmod +x llvm.sh - sudo ./llvm.sh ${{matrix.version}} - sudo apt-get install -y clang-tools-${{matrix.version}} - sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{matrix.version}} 200 - sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{matrix.version}} 200 - sudo update-alternatives --install /usr/bin/clang-scan-deps clang-scan-deps /usr/bin/clang-scan-deps-${{matrix.version}} 200 - sudo update-alternatives --set clang /usr/bin/clang-${{matrix.version}} - sudo update-alternatives --set clang++ /usr/bin/clang++-${{matrix.version}} - sudo update-alternatives 
--set clang-scan-deps /usr/bin/clang-scan-deps-${{matrix.version}} - - - name: Install specific version of tzdata - if: matrix.date-polyfill == 'OFF' - run: sudo apt-get install tzdata=2024a-2ubuntu1 -y --allow-downgrades - - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set conda environment - uses: mamba-org/setup-micromamba@main - with: - environment-name: myenv - environment-file: environment-dev.yml - init-shell: bash - cache-downloads: true - - - name: Configure using CMake - run: | - if [[ "${{matrix.compiler}}" = "gcc" ]]; then export CC=gcc-${{matrix.version}}; export CXX=g++-${{matrix.version}}; else export CC=clang; export CXX=clang++; fi - cmake -G Ninja \ - -Bbuild \ - -DCMAKE_BUILD_TYPE:STRING=${{matrix.config}} \ - -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ - -DCMAKE_CXX_STANDARD=${{matrix.cpp_standard}} \ - -DUSE_DATE_POLYFILL=${{matrix.date-polyfill}} \ - -DBUILD_TESTS=OFF \ - -DENABLE_INTEGRATION_TEST=OFF \ - -DBUILD_EXAMPLES=OFF \ - -DBUILD_BENCHMARKS=ON \ - -DCMAKE_C_COMPILER_LAUNCHER=sccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ - -DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING \ - -DSPARROW_BUILD_SHARED=${{matrix.shared}} \ - -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \ - -DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold" - - - name: Build the benchmark target(s) - working-directory: build - run: cmake --build . --config ${{matrix.config}} --target sparrow_benchmarks - - - name: Run benchmarks - working-directory: build - run: cmake --build . 
--config ${{matrix.config}} --target run_benchmarks_json - - - name: Upload benchmark results - uses: actions/upload-artifact@v4 - with: - name: benchmark-results - path: ./**/sparrow_benchmarks.json - - - name: Download previous benchmark data - uses: actions/cache@v4 - with: - path: ./cache - key: ${{ runner.os }}-benchmarks - - - name: Store benchmark result - uses: benchmark-action/github-action-benchmark@v1 - with: - # What benchmark tool the output.txt came from - tool: 'googlecpp' - # Where the output from the benchmark tool is stored - output-file-path: build/benchmarks/sparrow_benchmarks.json - # Where the previous data file is stored - external-data-json-path: ./cache/benchmark-data.json - # Workflow will fail when an alert happens - fail-on-alert: true - - - name: Run sccache stat for check - shell: bash - run: ${SCCACHE_PATH} --show-stats + - name: Run sccache stat for check + shell: bash + run: ${SCCACHE_PATH} --show-stats diff --git a/CMakeLists.txt b/CMakeLists.txt index 01602325b..092c4ce5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,8 @@ OPTION(BUILD_TESTS "Build sparrow test suite" OFF) MESSAGE(STATUS "🔧 Build tests: ${BUILD_TESTS}") OPTION(BUILD_BENCHMARKS "Build sparrow benchmark suite" OFF) MESSAGE(STATUS "🔧 Build benchmarks: ${BUILD_BENCHMARKS}") +OPTION(BUILD_COMPARATIVE_BENCHMARKS "Build comparative benchmarks" OFF) +MESSAGE(STATUS "🔧 Build comparative benchmarks: ${BUILD_COMPARATIVE_BENCHMARKS}") OPTION(BUILD_DOCS "Build sparrow documentation" OFF) MESSAGE(STATUS "🔧 Build docs: ${BUILD_DOCS}") OPTION(BUILD_EXAMPLES "Build sparrow examples" OFF) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index d9503edd8..42e293870 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -20,13 +20,6 @@ if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) set(SPARROW_INCLUDE_DIR ${sparrow_INCLUDE_DIRS}) endif() -if(NOT CMAKE_BUILD_TYPE) - message(STATUS "Setting benchmarks build type to Release") - 
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) -else() - message(STATUS "Benchmarks build type is ${CMAKE_BUILD_TYPE}") -endif() - set(SPARROW_BENCHMARK_SOURCES main.cpp bench_dynamic_bitset.cpp @@ -74,3 +67,7 @@ set_target_properties(run_benchmarks_json PROPERTIES FOLDER "Benchmarks" ) + +if(BUILD_COMPARATIVE_BENCHMARKS) + add_subdirectory(comparative) +endif() diff --git a/benchmarks/comparative/CMakeLists.txt b/benchmarks/comparative/CMakeLists.txt new file mode 100644 index 000000000..5b21107f0 --- /dev/null +++ b/benchmarks/comparative/CMakeLists.txt @@ -0,0 +1,42 @@ +# Comparative benchmarks between sparrow and Apache Arrow + +message(STATUS "Configuring comparative benchmarks...") + +add_executable(sparrow_benchmarks_comparative + benchmark_primitive.cpp +) + +target_link_libraries(sparrow_benchmarks_comparative + PRIVATE + sparrow + arrow_static + benchmark::benchmark + benchmark::benchmark_main +) + +set(ARROW_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/_deps/arrow-src/cpp/src ${CMAKE_BINARY_DIR}/_deps/arrow-build/src) + +target_include_directories(sparrow_benchmarks_comparative + PRIVATE + ${ARROW_INCLUDE_DIRS} +) + +message(STATUS "Arrow include dir: ${ARROW_INCLUDE_DIRS}") + +set_target_properties(sparrow_benchmarks_comparative PROPERTIES FOLDER "Benchmarks") + +add_custom_target(run_comparative_benchmarks + COMMAND sparrow_benchmarks_comparative + DEPENDS sparrow_benchmarks_comparative + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Running comparative benchmarks (sparrow vs Arrow)" +) +set_target_properties(run_comparative_benchmarks PROPERTIES FOLDER "Benchmarks") + +add_custom_target(run_comparative_benchmarks_json + COMMAND sparrow_benchmarks_comparative --benchmark_format=json --benchmark_out=sparrow_benchmarks_comparative.json + DEPENDS sparrow_benchmarks_comparative + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Running comparative benchmarks (JSON output)" +)
+set_target_properties(run_comparative_benchmarks_json PROPERTIES FOLDER "Benchmarks") diff --git a/benchmarks/comparative/benchmark_primitive.cpp b/benchmarks/comparative/benchmark_primitive.cpp new file mode 100644 index 000000000..9841282a2 --- /dev/null +++ b/benchmarks/comparative/benchmark_primitive.cpp @@ -0,0 +1,588 @@ +// Copyright 2024 Man Group Operations Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace +{ + template + std::vector generate_sequential_data(size_t size) + { + std::vector data; + data.reserve(size); + + if constexpr (std::is_same_v) + { + for (size_t i = 0; i < size; ++i) + { + data.push_back(i % 2 == 0); + } + } + else if constexpr (std::is_floating_point_v) + { + for (size_t i = 0; i < size; ++i) + { + data.push_back(static_cast(i) * static_cast(0.1)); + } + } + else + { + for (size_t i = 0; i < size; ++i) + { + data.push_back(static_cast(i)); + } + } + + return data; + } + + template + std::vector> + generate_nullable_data(const std::vector& data, double null_probability, std::mt19937& gen) + { + std::vector> nullable_data; + nullable_data.reserve(data.size()); + std::bernoulli_distribution null_dist(null_probability); + + for (const auto& value : data) + { + if (null_dist(gen)) + { + nullable_data.emplace_back(); // null value + } + else + { + nullable_data.emplace_back(value); + } + } + + return 
nullable_data; + } + + template + static void BM_Sparrow_CreateArray(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + auto data = generate_sequential_data(size); + + for (auto _ : state) + { + sparrow::primitive_array array(data); + benchmark::DoNotOptimize(array); + benchmark::ClobberMemory(); + } + + state.SetItemsProcessed(static_cast(state.iterations() * size)); + } + + template + static void BM_Sparrow_ReadArrayElementAccess(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + auto data = generate_sequential_data(size); + const sparrow::primitive_array array(data); + + T sum = T{}; + size_t index = 0; + const auto values = array.values(); + const auto bitmap = array.bitmap(); + + for (auto _ : state) + { + const size_t real_index = index % size; + if (bitmap[real_index]) + { + sum += values[real_index]; + } + index++; + benchmark::DoNotOptimize(sum); + } + state.SetItemsProcessed(static_cast(state.iterations())); + } + + // Sparrow Array Reading Benchmarks - Range-based for loop + template + static void BM_Sparrow_ReadArrayRangeFor(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + auto data = generate_sequential_data(size); + sparrow::primitive_array array(data); + + for (auto _ : state) + { + T sum = T{}; + for (const auto& element : array) + { + if (element.has_value()) + { + sum += element.value(); + } + } + benchmark::DoNotOptimize(sum); + benchmark::ClobberMemory(); + } + + state.SetItemsProcessed(static_cast(state.iterations() * size)); + } + + // Arrow Array Creation Benchmarks + template + static void BM_Arrow_CreateArray(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + auto data = generate_sequential_data(size); + + for (auto _ : state) + { + // Create Arrow array using the builder pattern + std::shared_ptr array; + + if constexpr (std::is_same_v) + { + arrow::Int32Builder builder; + builder.AppendValues(data); + auto 
maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::Int64Builder builder; + builder.AppendValues(data); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::FloatBuilder builder; + builder.AppendValues(data); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::DoubleBuilder builder; + builder.AppendValues(data); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::UInt32Builder builder; + builder.AppendValues(data); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::UInt64Builder builder; + builder.AppendValues(data); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::BooleanBuilder builder; + builder.AppendValues(data); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + + benchmark::DoNotOptimize(array); + benchmark::ClobberMemory(); + } + + state.SetItemsProcessed(static_cast(state.iterations() * size)); + } + + template + std::shared_ptr create_arrow_array(size_t size) + { + auto data = generate_sequential_data(size); + std::shared_ptr array; + + auto build_array = [&]() + { + U builder; + builder.AppendValues(data); + auto maybe_array = builder.Finish(); + array = *maybe_array; + }; + + if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + 
else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + else if constexpr (std::is_same_v) + { + build_array.template operator()(); + } + return array; + } + + // Arrow Array Reading Benchmarks - Element Access + template + static void BM_Arrow_ReadArrayElementAccess(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + const auto array = create_arrow_array(size); + const auto bench = [&]() + { + const auto typed_array = std::static_pointer_cast(array); + T sum = T{}; + size_t index = 0; + for (auto _ : state) + { + const size_t real_index = index % size; + if (!typed_array->IsNull(real_index)) + { + sum += typed_array->Value(real_index); + } + index++; + benchmark::DoNotOptimize(sum); + } + state.SetItemsProcessed(static_cast(state.iterations())); + }; + + if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + } + + // Arrow Array Reading Benchmarks - Raw values access + template + static void BM_Arrow_ReadArrayRawValues(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + + // Create Arrow array + const std::shared_ptr array = create_arrow_array(size); + + const auto bench = [&]() + { + auto typed_array = std::static_pointer_cast(array); + if constexpr (std::is_same_v) + { + for (auto _ : 
state) + { + // For bool arrays, we iterate through each bit + for (size_t i = 0; i < size; ++i) + { + volatile bool val = typed_array->Value(i); + benchmark::DoNotOptimize(val); + } + } + } + else + { + const auto* raw_values = typed_array->raw_values(); + + T sum = T{}; + for (auto _ : state) + { + for (size_t i = 0; i < size; ++i) + { + sum += raw_values[i]; + } + benchmark::DoNotOptimize(sum); + } + } + }; + + if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + else if constexpr (std::is_same_v) + { + bench.template operator()(); + } + benchmark::ClobberMemory(); + state.SetItemsProcessed(static_cast(state.iterations() * size)); + } + + // Benchmarks with null values + template + static void BM_Sparrow_CreateArrayWithNulls(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + std::mt19937 gen(42); // Fixed seed for reproducibility + auto data = generate_sequential_data(size); + auto nullable_data = generate_nullable_data(data, 0.1, gen); // 10% null values + + for (auto _ : state) + { + sparrow::primitive_array array(nullable_data); + benchmark::DoNotOptimize(array); + benchmark::ClobberMemory(); + } + + state.SetItemsProcessed(static_cast(state.iterations() * size)); + } + + template + static void BM_Arrow_CreateArrayWithNulls(benchmark::State& state) + { + const size_t size = static_cast(state.range(0)); + auto data = generate_sequential_data(size); + + std::mt19937 gen(42); // Fixed seed 
for reproducibility - same as Sparrow + + // Generate validity vector (10% nulls) + std::vector validity; + validity.reserve(size); + std::bernoulli_distribution null_dist(0.1); + for (size_t i = 0; i < size; ++i) + { + validity.push_back(!null_dist(gen)); // true means valid, false means null + } + + for (auto _ : state) + { + std::shared_ptr array; + + if constexpr (std::is_same_v) + { + arrow::Int8Builder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::Int16Builder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::Int32Builder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::Int64Builder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::FloatBuilder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::DoubleBuilder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::UInt32Builder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::UInt64Builder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + else if constexpr (std::is_same_v) + { + arrow::BooleanBuilder builder; + builder.AppendValues(data, validity); + auto maybe_array = builder.Finish(); + array = *maybe_array; + } + + 
benchmark::DoNotOptimize(array); + benchmark::ClobberMemory(); + } + + state.SetItemsProcessed(static_cast(state.iterations() * size)); + } + +// Macro to register all benchmarks for a specific type +#define REGISTER_PRIMITIVE_BENCHMARKS(TYPE, NAME) \ + BENCHMARK_TEMPLATE(BM_Sparrow_CreateArray, TYPE) \ + ->Name("Sparrow_CreateArray_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kMicrosecond); \ + \ + BENCHMARK_TEMPLATE(BM_Arrow_CreateArray, TYPE) \ + ->Name("Arrow_CreateArray_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kMicrosecond); \ + \ + BENCHMARK_TEMPLATE(BM_Sparrow_ReadArrayElementAccess, TYPE) \ + ->Name("Sparrow_ReadArray_ElementAccess_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kNanosecond); \ + \ + BENCHMARK_TEMPLATE(BM_Arrow_ReadArrayElementAccess, TYPE) \ + ->Name("Arrow_ReadArray_ElementAccess_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kNanosecond); \ + \ + BENCHMARK_TEMPLATE(BM_Sparrow_ReadArrayRangeFor, TYPE) \ + ->Name("Sparrow_ReadArray_RangeFor_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kMicrosecond); \ + \ + BENCHMARK_TEMPLATE(BM_Arrow_ReadArrayRawValues, TYPE) \ + ->Name("Arrow_ReadArray_RawValues_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kMicrosecond); \ + \ + BENCHMARK_TEMPLATE(BM_Sparrow_CreateArrayWithNulls, TYPE) \ + ->Name("Sparrow_CreateArrayWithNulls_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kMicrosecond); \ + \ + BENCHMARK_TEMPLATE(BM_Arrow_CreateArrayWithNulls, TYPE) \ + ->Name("Arrow_CreateArrayWithNulls_" NAME) \ + ->RangeMultiplier(10) \ + ->Range(100, 100000) \ + ->Unit(benchmark::kMicrosecond); + +} // namespace + +// Register benchmarks for all types +REGISTER_PRIMITIVE_BENCHMARKS(std::int8_t, "Int8") +REGISTER_PRIMITIVE_BENCHMARKS(std::int16_t, "Int16") 
+REGISTER_PRIMITIVE_BENCHMARKS(std::int32_t, "Int32") +REGISTER_PRIMITIVE_BENCHMARKS(std::int64_t, "Int64") +REGISTER_PRIMITIVE_BENCHMARKS(std::uint32_t, "UInt32") +REGISTER_PRIMITIVE_BENCHMARKS(std::uint64_t, "UInt64") +REGISTER_PRIMITIVE_BENCHMARKS(float, "Float") +REGISTER_PRIMITIVE_BENCHMARKS(double, "Double") +REGISTER_PRIMITIVE_BENCHMARKS(bool, "Bool") + +#undef REGISTER_PRIMITIVE_BENCHMARKS diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index 51cececfe..8dc625df4 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -114,4 +114,140 @@ if(BUILD_BENCHMARKS) set_target_properties(benchmark_main PROPERTIES FOLDER "GoogleBenchmark") endif() endif() + if(BUILD_COMPARATIVE_BENCHMARKS) + if(NOT FETCH_DEPENDENCIES_WITH_CMAKE STREQUAL "ON") + find_package(arrow CONFIG ${FIND_PACKAGE_OPTIONS}) + endif() + if(FETCH_DEPENDENCIES_WITH_CMAKE STREQUAL "ON" OR FETCH_DEPENDENCIES_WITH_CMAKE STREQUAL "MISSING") + if(NOT arrow_FOUND) + set(ARROW_VERSION "21.0.0") + message(STATUS "📦 Fetching Apache Arrow ${ARROW_VERSION}") + + # Set minimal build options to avoid dependencies + set(ARROW_BUILD_SHARED OFF) + set(ARROW_BUILD_STATIC ON) + set(ARROW_BUILD_TESTS OFF) + set(ARROW_BUILD_BENCHMARKS OFF) + set(ARROW_BUILD_EXAMPLES OFF) + set(ARROW_BUILD_INTEGRATION OFF) + set(ARROW_BUILD_UTILITIES OFF) + + # Disable all optional features and dependencies + set(ARROW_GANDIVA OFF) + set(ARROW_PARQUET OFF) + set(ARROW_SUBSTRAIT OFF) + set(ARROW_ACERO OFF) + set(ARROW_COMPUTE OFF) + set(ARROW_DATASET OFF) + set(ARROW_FILESYSTEM OFF) + set(ARROW_HDFS OFF) + set(ARROW_FLIGHT OFF) + set(ARROW_FLIGHT_SQL OFF) + set(ARROW_CUDA OFF) + set(ARROW_CSV OFF) + set(ARROW_JSON OFF) + set(ARROW_S3 OFF) + set(ARROW_GCS OFF) + set(ARROW_ORC OFF) + + # Disable compression libraries + set(ARROW_WITH_BROTLI OFF) + set(ARROW_WITH_BZ2 OFF) + set(ARROW_WITH_LZ4 OFF) + set(ARROW_WITH_SNAPPY OFF) + set(ARROW_WITH_ZLIB OFF) + 
set(ARROW_WITH_ZSTD OFF) + + # Disable other optional dependencies + set(ARROW_WITH_BACKTRACE OFF) + set(ARROW_WITH_THRIFT OFF) + set(ARROW_WITH_PROTOBUF OFF) + set(ARROW_WITH_GRPC OFF) + set(ARROW_WITH_GFLAGS OFF) + set(ARROW_WITH_GLOG OFF) + set(ARROW_USE_GLOG OFF) + set(ARROW_WITH_UTF8PROC OFF) + set(ARROW_WITH_RE2 OFF) + set(ARROW_USE_OPENSSL OFF) + set(ARROW_WITH_OPENSSL OFF) + set(ARROW_JEMALLOC OFF) + set(ARROW_MIMALLOC OFF) + set(ARROW_USE_BOOST OFF) + set(ARROW_BOOST_REQUIRED OFF) + + # Set SIMD level to none to avoid xsimd dependency + set(ARROW_SIMD_LEVEL NONE) + set(ARROW_RUNTIME_SIMD_LEVEL NONE) + + # Disable deprecated API + set(ARROW_NO_DEPRECATED_API ON) + + # Use bundled dependencies for remaining required components + set(ARROW_DEPENDENCY_SOURCE BUNDLED) + + FetchContent_Declare( + arrow + GIT_SHALLOW TRUE + GIT_REPOSITORY https://github.com/apache/arrow.git + GIT_TAG apache-arrow-${ARROW_VERSION} + GIT_PROGRESS TRUE + SYSTEM + EXCLUDE_FROM_ALL + SOURCE_SUBDIR cpp) + FetchContent_MakeAvailable(arrow) + + # Clean up cache variables + unset(ARROW_BUILD_SHARED CACHE) + unset(ARROW_BUILD_STATIC CACHE) + unset(ARROW_BUILD_TESTS CACHE) + unset(ARROW_BUILD_BENCHMARKS CACHE) + unset(ARROW_BUILD_EXAMPLES CACHE) + unset(ARROW_BUILD_INTEGRATION CACHE) + unset(ARROW_BUILD_UTILITIES CACHE) + unset(ARROW_GANDIVA CACHE) + unset(ARROW_PARQUET CACHE) + unset(ARROW_SUBSTRAIT CACHE) + unset(ARROW_ACERO CACHE) + unset(ARROW_COMPUTE CACHE) + unset(ARROW_DATASET CACHE) + unset(ARROW_FILESYSTEM CACHE) + unset(ARROW_HDFS CACHE) + unset(ARROW_FLIGHT CACHE) + unset(ARROW_FLIGHT_SQL CACHE) + unset(ARROW_CUDA CACHE) + unset(ARROW_CSV CACHE) + unset(ARROW_JSON CACHE) + unset(ARROW_S3 CACHE) + unset(ARROW_GCS CACHE) + unset(ARROW_ORC CACHE) + unset(ARROW_WITH_BROTLI CACHE) + unset(ARROW_WITH_BZ2 CACHE) + unset(ARROW_WITH_LZ4 CACHE) + unset(ARROW_WITH_SNAPPY CACHE) + unset(ARROW_WITH_ZLIB CACHE) + unset(ARROW_WITH_ZSTD CACHE) + unset(ARROW_WITH_BACKTRACE CACHE) + 
unset(ARROW_WITH_THRIFT CACHE) + unset(ARROW_WITH_PROTOBUF CACHE) + unset(ARROW_WITH_GRPC CACHE) + unset(ARROW_WITH_GFLAGS CACHE) + unset(ARROW_WITH_GLOG CACHE) + unset(ARROW_USE_GLOG CACHE) + unset(ARROW_WITH_UTF8PROC CACHE) + unset(ARROW_WITH_RE2 CACHE) + unset(ARROW_USE_OPENSSL CACHE) + unset(ARROW_WITH_OPENSSL CACHE) + unset(ARROW_JEMALLOC CACHE) + unset(ARROW_MIMALLOC CACHE) + unset(ARROW_USE_BOOST CACHE) + unset(ARROW_BOOST_REQUIRED CACHE) + unset(ARROW_SIMD_LEVEL CACHE) + unset(ARROW_RUNTIME_SIMD_LEVEL CACHE) + unset(ARROW_NO_DEPRECATED_API CACHE) + unset(ARROW_DEPENDENCY_SOURCE CACHE) + + message(STATUS "\t✅ Fetched Apache Arrow ${ARROW_VERSION}") + endif() + endif() + endif() endif() diff --git a/conanfile.py b/conanfile.py index 151ed1ebb..16adb46bb 100644 --- a/conanfile.py +++ b/conanfile.py @@ -26,6 +26,7 @@ class SparrowRecipe(ConanFile): "use_date_polyfill": [True, False], "build_tests": [True, False], "build_benchmarks": [True, False], + "build_comparative_benchmarks": [True, False], "generate_documentation": [True, False], } default_options = { @@ -34,23 +35,26 @@ class SparrowRecipe(ConanFile): "use_date_polyfill": False, "build_tests": False, "build_benchmarks": False, + "build_comparative_benchmarks": False, "generate_documentation": False, } def requirements(self): if self.options.get_safe("use_date_polyfill"): - self.requires("date/3.0.3") + self.requires("date/3.0.4") if self.options.get_safe("build_tests"): - self.test_requires("doctest/2.4.11") - self.test_requires("catch2/3.7.0") + self.test_requires("doctest/2.4.12") + self.test_requires("catch2/3.10.0") self.test_requires("nlohmann_json/3.12.0") if self.options.get_safe("build_benchmarks"): self.test_requires("benchmark/1.9.4") + if self.options.get_safe("build_comparative_benchmarks"): + self.test_requires("arrow/21.0.0") def build_requirements(self): self.tool_requires("cmake/[>=3.28.1 <4.2.0]") if self.options.get_safe("generate_documentation"): - self.tool_requires("doxygen/1.9.4", 
options={"enable_app": "True"}) + self.tool_requires("doxygen/1.14.0", options={"enable_app": "True"}) @property def _min_cppstd(self): @@ -98,6 +102,9 @@ def generate(self): tc.variables["BUILD_BENCHMARKS"] = self.options.get_safe( "build_benchmarks", False ) + tc.variables["BUILD_COMPARATIVE_BENCHMARKS"] = self.options.get_safe( + "build_comparative_benchmarks", False + ) if is_msvc(self): tc.variables["USE_LARGE_INT_PLACEHOLDERS"] = True tc.generate() diff --git a/docs/source/typed_array.md b/docs/source/typed_array.md index 6089eaf63..20034ed4a 100644 --- a/docs/source/typed_array.md +++ b/docs/source/typed_array.md @@ -86,7 +86,7 @@ Typed arrays provide the following const methods to read elements: | back | Access the last element | For an array holding data of type `T`, these methods return values -of type `nullable<T>`. +of type `nullable<T>`. Using `nullable` allows you to handle potential null values gracefully but has a performance penalty. Example: @@ -105,6 +105,27 @@ try { } ``` +### Ranges + +Typed arrays provide the following methods to access a range of elements: + +| Method | Description | +| ------ | ----------------------------------------| +| values | Returns the raw values as a range. | +| bitmap | Returns the validity bitmap as a range. | + +The `values` method returns a range of type `span`, allowing efficient access to the underlying data. This is particularly useful for operations that need to process all values, including nulls. + +```cpp +#include "sparrow.hpp" +namespace sp = sparrow; + +sp::primitive_array pa = {1, 2, 3, 4}; +for (auto v : pa.values()) { + std::cout << v << ' '; // Prints 1 2 3 4 +} +``` + ### Iterators Typed arrays also provide traditional iteration methods: @@ -120,6 +141,8 @@ Typed arrays also provide traditional iteration methods: | rend | Returns a reverse iterator to the end | | crend | Returns a reverse iterator to the end | +These methods return iterators that dereference to `nullable`. + Example: ```cpp