Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 45 additions & 16 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,15 +23,44 @@

#include <nvbench/nvbench.cuh>

#include <limits>

NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

/**
 * @brief Translates the value of the benchmark's "keep" string axis into the
 * corresponding `cudf::duplicate_keep_option` enumerator.
 *
 * The function is `static` (internal linkage) so that this definition stays
 * local to this translation unit and cannot collide at link time with an
 * identically named helper defined in another benchmark source file that is
 * linked into the same executable.
 *
 * @param keep_str One of "any", "first", "last", or "none"
 * @return KEEP_ANY, KEEP_FIRST, KEEP_LAST, or KEEP_NONE respectively
 *
 * Any other string is reported through `CUDF_FAIL`.
 */
static cudf::duplicate_keep_option get_keep(std::string const& keep_str)
{
  if (keep_str == "any") { return cudf::duplicate_keep_option::KEEP_ANY; }
  if (keep_str == "first") { return cudf::duplicate_keep_option::KEEP_FIRST; }
  if (keep_str == "last") { return cudf::duplicate_keep_option::KEEP_LAST; }
  if (keep_str == "none") { return cudf::duplicate_keep_option::KEEP_NONE; }

  // No recognized option matched: fail loudly rather than silently picking a default.
  CUDF_FAIL("Unsupported keep option.");
}

template <typename Type>
void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::size_type const num_rows = state.get_int64("NumRows");
cudf::size_type const num_rows = state.get_int64("NumRows");
auto const keep = get_keep(state.get_string("keep"));
cudf::size_type const cardinality = state.get_int64("cardinality");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
return;
}
Comment on lines +38 to +41
Copy link
Contributor

@GregoryKimball GregoryKimball Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer to omit this skipping condition. I recognize that we can't have 1M distinct elements in 1K rows, but this condition adds a lot of friction when sweeping NumRows for the high-cardinality case. It forces me to run a full factorial of NumRows and cardinality values and then filter the output to find, for each NumRows, the highest cardinality that was not skipped.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll rewrite this logic. Thanks for the feedback!

Copy link
Contributor Author

@bdice bdice Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. @GregoryKimball I reviewed the NVBench docs and I don't see a way to filter out certain jobs except by skipping them. https://github.com/NVIDIA/nvbench/blob/main/docs/benchmarks.md#beware-combinatorial-explosion-is-lurking

We might be able to use a string axis like {"100,100", "100,1000", ..., "1000000000,1000000000"} and parse it, but that's hard to maintain.

Copy link
Member

@PointKernel PointKernel Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> I don't see a way to filter out certain jobs except by skipping them.

NVIDIA/nvbench#80 can solve this issue but the PR has been stalled for a while.


data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
data_profile profile = data_profile_builder()
.cardinality(cardinality)
.null_probability(0.01)
.distribution(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
std::numeric_limits<Type>::max());

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

Expand All @@ -40,27 +69,29 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::distinct(input_table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::distinct(
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
using data_type = nvbench::type_list<int32_t, int64_t>;

NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_int64_axis("cardinality",
{100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 100'000'000, 1'000'000'000})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can just decrease the default values in the axis.

.add_int64_axis("NumRows",
{100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 100'000'000, 1'000'000'000});

template <typename Type>
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = state.get_int64("ColumnSize");
auto const dtype = cudf::type_to_id<Type>();
double const null_probability = state.get_float64("null_probability");
auto const keep = get_keep(state.get_string("keep"));

auto builder = data_profile_builder().null_probability(null_probability);
if (dtype == cudf::type_id::LIST) {
Expand All @@ -80,17 +111,15 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::distinct(*table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result =
cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

// Registers nvbench_distinct_list with NVBench under the name "distinct_list".
// The single type axis ("Type") covers int32_t and cudf::list_view. The runtime
// axes are:
//   - "keep":             string axis, parsed by get_keep() inside the benchmark
//   - "null_probability": 0.0 and 0.1
//   - "ColumnSize":       a single value, 100'000'000
NVBENCH_BENCH_TYPES(nvbench_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("distinct_list")
.set_type_axes_names({"Type"})
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_float64_axis("null_probability", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
61 changes: 45 additions & 16 deletions cpp/benchmarks/stream_compaction/stable_distinct.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,15 +23,44 @@

#include <nvbench/nvbench.cuh>

#include <limits>

NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

/**
 * @brief Translates the value of the benchmark's "keep" string axis into the
 * corresponding `cudf::duplicate_keep_option` enumerator.
 *
 * The function is `static` (internal linkage) so that this definition stays
 * local to this translation unit and cannot collide at link time with an
 * identically named helper defined in another benchmark source file that is
 * linked into the same executable.
 *
 * @param keep_str One of "any", "first", "last", or "none"
 * @return KEEP_ANY, KEEP_FIRST, KEEP_LAST, or KEEP_NONE respectively
 *
 * Any other string is reported through `CUDF_FAIL`.
 */
static cudf::duplicate_keep_option get_keep(std::string const& keep_str)
{
  if (keep_str == "any") { return cudf::duplicate_keep_option::KEEP_ANY; }
  if (keep_str == "first") { return cudf::duplicate_keep_option::KEEP_FIRST; }
  if (keep_str == "last") { return cudf::duplicate_keep_option::KEEP_LAST; }
  if (keep_str == "none") { return cudf::duplicate_keep_option::KEEP_NONE; }

  // No recognized option matched: fail loudly rather than silently picking a default.
  CUDF_FAIL("Unsupported keep option.");
}

template <typename Type>
void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::size_type const num_rows = state.get_int64("NumRows");
cudf::size_type const num_rows = state.get_int64("NumRows");
auto const keep = get_keep(state.get_string("keep"));
cudf::size_type const cardinality = state.get_int64("cardinality");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
return;
}

data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
data_profile profile = data_profile_builder()
.cardinality(cardinality)
.null_probability(0.01)
.distribution(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
std::numeric_limits<Type>::max());

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);

Expand All @@ -40,27 +69,29 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::stable_distinct(input_table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::stable_distinct(
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
using data_type = nvbench::type_list<int32_t, int64_t>;

NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("stable_distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_int64_axis("cardinality",
{100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 100'000'000, 1'000'000'000})
.add_int64_axis("NumRows",
{100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 100'000'000, 1'000'000'000});

template <typename Type>
void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
auto const size = state.get_int64("ColumnSize");
auto const dtype = cudf::type_to_id<Type>();
double const null_probability = state.get_float64("null_probability");
auto const keep = get_keep(state.get_string("keep"));

auto builder = data_profile_builder().null_probability(null_probability);
if (dtype == cudf::type_id::LIST) {
Expand All @@ -80,17 +111,15 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = cudf::stable_distinct(*table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL);
auto result = cudf::stable_distinct(
*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
});
}

// Registers nvbench_stable_distinct_list with NVBench under the name
// "stable_distinct_list". The single type axis ("Type") covers int32_t and
// cudf::list_view. The runtime axes are:
//   - "keep":             string axis, parsed by get_keep() inside the benchmark
//   - "null_probability": 0.0 and 0.1
//   - "ColumnSize":       a single value, 100'000'000
NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("stable_distinct_list")
.set_type_axes_names({"Type"})
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_float64_axis("null_probability", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});