Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ ConfigureNVBench(
stream_compaction/distinct.cpp
stream_compaction/distinct_count.cpp
stream_compaction/stable_distinct.cpp
stream_compaction/stream_compaction_common.cpp
stream_compaction/unique.cpp
stream_compaction/unique_count.cpp
)
Expand Down
47 changes: 31 additions & 16 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/stream_compaction/stream_compaction_common.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/lists/list_view.hpp>
Expand All @@ -23,15 +24,29 @@

#include <nvbench/nvbench.cuh>

#include <limits>

NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

template <typename Type>
void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::size_type const num_rows = state.get_int64("NumRows");
cudf::size_type const num_rows = state.get_int64("NumRows");
auto const keep = get_keep(state.get_string("keep"));
cudf::size_type const cardinality = state.get_int64("cardinality");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
return;
}
Comment on lines +38 to +41
Copy link
Contributor

@GregoryKimball GregoryKimball Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer to omit this skipping condition. I recognize that we can't have 1M distinct elements in 1K rows, but this condition adds a lot of friction when sweeping NumRows for the high-cardinality case. It forces me to run a full factorial of matching NumRows and Cardinality values and then filter the outputs for the highest unskipped Cardinality for each NumRows.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll rewrite this logic. Thanks for the feedback!

Copy link
Contributor Author

@bdice bdice Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. @GregoryKimball I reviewed the NVBench docs and I don't see a way to filter out certain jobs except by skipping them. https://github.com/NVIDIA/nvbench/blob/main/docs/benchmarks.md#beware-combinatorial-explosion-is-lurking

We might be able to use a string axis like {"100,100", "100,1000", ..., "1000000000,1000000000"} and parse it, but that's hard to maintain.

Copy link
Member

@PointKernel PointKernel Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> I don't see a way to filter out certain jobs except by skipping them.

NVIDIA/nvbench#80 can solve this issue but the PR has been stalled for a while.


data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
data_profile profile = data_profile_builder()
.cardinality(cardinality)
.null_probability(0.01)
.distribution(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
std::numeric_limits<Type>::max());

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

Expand All @@ -40,27 +55,29 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::distinct(input_table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::distinct(
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
using data_type = nvbench::type_list<int32_t, int64_t>;

NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_int64_axis("cardinality",
{100, 100'000, 10'000'000, 1'000'000'000})
.add_int64_axis("NumRows",
{100, 100'000, 10'000'000, 1'000'000'000});

template <typename Type>
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = state.get_int64("ColumnSize");
auto const dtype = cudf::type_to_id<Type>();
double const null_probability = state.get_float64("null_probability");
auto const keep = get_keep(state.get_string("keep"));

auto builder = data_profile_builder().null_probability(null_probability);
if (dtype == cudf::type_id::LIST) {
Expand All @@ -80,17 +97,15 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::distinct(*table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result =
cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

NVBENCH_BENCH_TYPES(nvbench_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("distinct_list")
.set_type_axes_names({"Type"})
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_float64_axis("null_probability", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
47 changes: 31 additions & 16 deletions cpp/benchmarks/stream_compaction/stable_distinct.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/stream_compaction/stream_compaction_common.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/lists/list_view.hpp>
Expand All @@ -23,15 +24,29 @@

#include <nvbench/nvbench.cuh>

#include <limits>

NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

template <typename Type>
void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::size_type const num_rows = state.get_int64("NumRows");
cudf::size_type const num_rows = state.get_int64("NumRows");
auto const keep = get_keep(state.get_string("keep"));
cudf::size_type const cardinality = state.get_int64("cardinality");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
return;
}

data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
data_profile profile = data_profile_builder()
.cardinality(cardinality)
.null_probability(0.01)
.distribution(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
std::numeric_limits<Type>::max());

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

Expand All @@ -40,27 +55,29 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::stable_distinct(input_table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::stable_distinct(
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
using data_type = nvbench::type_list<int32_t, int64_t>;

NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("stable_distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_int64_axis("cardinality",
{100, 100'000, 10'000'000, 1'000'000'000})
.add_int64_axis("NumRows",
{100, 100'000, 10'000'000, 1'000'000'000});

template <typename Type>
void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = state.get_int64("ColumnSize");
auto const dtype = cudf::type_to_id<Type>();
double const null_probability = state.get_float64("null_probability");
auto const keep = get_keep(state.get_string("keep"));

auto builder = data_profile_builder().null_probability(null_probability);
if (dtype == cudf::type_id::LIST) {
Expand All @@ -80,17 +97,15 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::stable_distinct(*table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::stable_distinct(
*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("stable_distinct_list")
.set_type_axes_names({"Type"})
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_float64_axis("null_probability", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
35 changes: 35 additions & 0 deletions cpp/benchmarks/stream_compaction/stream_compaction_common.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/stream_compaction/stream_compaction_common.hpp>

#include <cudf/stream_compaction.hpp>
#include <cudf/utilities/error.hpp>

/**
 * @brief Maps a keep-policy name onto the matching `cudf::duplicate_keep_option`.
 *
 * The accepted names are "any", "first", "last" and "none". Any other string
 * is reported through `CUDF_FAIL`.
 *
 * @param keep_str Name of the keep policy to look up
 * @return The enumerator corresponding to `keep_str`
 */
cudf::duplicate_keep_option get_keep(std::string const& keep_str)
{
  using keep_option = cudf::duplicate_keep_option;

  // Guard clauses: each recognized name returns immediately.
  if (keep_str == "any") { return keep_option::KEEP_ANY; }
  if (keep_str == "first") { return keep_option::KEEP_FIRST; }
  if (keep_str == "last") { return keep_option::KEEP_LAST; }
  if (keep_str == "none") { return keep_option::KEEP_NONE; }

  // Nothing matched: fail instead of silently picking a default.
  CUDF_FAIL("Unsupported keep option.");
}
19 changes: 19 additions & 0 deletions cpp/benchmarks/stream_compaction/stream_compaction_common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/stream_compaction.hpp>

#include <string>

/**
 * @brief Converts a keep-policy name into a `cudf::duplicate_keep_option`.
 *
 * @param keep_str Name of the keep policy
 * @return The `cudf::duplicate_keep_option` selected by `keep_str`
 */
cudf::duplicate_keep_option get_keep(std::string const& keep_str);