Skip to content

Commit 9e28c17

Browse files
committed
Revert "Temporarily fix bug in dynamic top-k optimization (apache#16465)"
This reverts commit 5ca4ff0.
1 parent 2bf8441 commit 9e28c17

File tree

2 files changed

+46
-43
lines changed

2 files changed

+46
-43
lines changed

datafusion/core/tests/fuzz_cases/sort_query_fuzz.rs

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ use datafusion_execution::memory_pool::{
3131
};
3232
use datafusion_expr::display_schema;
3333
use datafusion_physical_plan::spill::get_record_batch_memory_size;
34-
use itertools::Itertools;
3534
use std::time::Duration;
3635

3736
use datafusion_execution::{memory_pool::FairSpillPool, runtime_env::RuntimeEnvBuilder};
@@ -73,43 +72,6 @@ async fn sort_query_fuzzer_runner() {
7372
fuzzer.run().await.unwrap();
7473
}
7574

76-
/// Reproduce the bug with specific seeds from the
77-
/// [failing test case](https://github.com/apache/datafusion/issues/16452).
78-
#[tokio::test(flavor = "multi_thread")]
79-
async fn test_reproduce_sort_query_issue_16452() {
80-
// Seeds from the failing test case
81-
let init_seed = 10313160656544581998u64;
82-
let query_seed = 15004039071976572201u64;
83-
let config_seed_1 = 11807432710583113300u64;
84-
let config_seed_2 = 759937414670321802u64;
85-
86-
let random_seed = 1u64; // Use a fixed seed to ensure consistent behavior
87-
88-
let mut test_generator = SortFuzzerTestGenerator::new(
89-
2000,
90-
3,
91-
"sort_fuzz_table".to_string(),
92-
get_supported_types_columns(random_seed),
93-
false,
94-
random_seed,
95-
);
96-
97-
let mut results = vec![];
98-
99-
for config_seed in [config_seed_1, config_seed_2] {
100-
let r = test_generator
101-
.fuzzer_run(init_seed, query_seed, config_seed)
102-
.await
103-
.unwrap();
104-
105-
results.push(r);
106-
}
107-
108-
for (lhs, rhs) in results.iter().tuple_windows() {
109-
check_equality_of_batches(lhs, rhs).unwrap();
110-
}
111-
}
112-
11375
/// SortQueryFuzzer holds the runner configuration for executing sort query fuzz tests. The fuzzing details are managed inside `SortFuzzerTestGenerator`.
11476
///
11577
/// It defines:

datafusion/physical-plan/src/topk/mod.rs

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
//! TopK: Combination of Sort / LIMIT
1919
2020
use arrow::{
21-
array::Array,
22-
compute::interleave_record_batch,
21+
array::{Array, AsArray},
22+
compute::{interleave_record_batch, prep_null_mask_filter, FilterBuilder},
2323
row::{RowConverter, Rows, SortField},
2424
};
2525
use datafusion_expr::{ColumnarValue, Operator};
@@ -203,7 +203,7 @@ impl TopK {
203203
let baseline = self.metrics.baseline.clone();
204204
let _timer = baseline.elapsed_compute().timer();
205205

206-
let sort_keys: Vec<ArrayRef> = self
206+
let mut sort_keys: Vec<ArrayRef> = self
207207
.expr
208208
.iter()
209209
.map(|expr| {
@@ -212,15 +212,56 @@ impl TopK {
212212
})
213213
.collect::<Result<Vec<_>>>()?;
214214

215+
let mut selected_rows = None;
216+
217+
if let Some(filter) = self.filter.as_ref() {
218+
// If a filter is provided, update it with the new rows
219+
let filter = filter.current()?;
220+
let filtered = filter.evaluate(&batch)?;
221+
let num_rows = batch.num_rows();
222+
let array = filtered.into_array(num_rows)?;
223+
let mut filter = array.as_boolean().clone();
224+
let true_count = filter.true_count();
225+
if true_count == 0 {
226+
// nothing to filter, so no need to update
227+
return Ok(());
228+
}
229+
// only update the keys / rows if the filter does not match all rows
230+
if true_count < num_rows {
231+
// Indices in `set_indices` should be correct if filter contains nulls
232+
// So we prepare the filter here. Note this is also done in the `FilterBuilder`
233+
// so there is no overhead to do this here.
234+
if filter.nulls().is_some() {
235+
filter = prep_null_mask_filter(&filter);
236+
}
237+
238+
let filter_predicate = FilterBuilder::new(&filter);
239+
let filter_predicate = if sort_keys.len() > 1 {
240+
// Optimize filter when it has multiple sort keys
241+
filter_predicate.optimize().build()
242+
} else {
243+
filter_predicate.build()
244+
};
245+
selected_rows = Some(filter);
246+
sort_keys = sort_keys
247+
.iter()
248+
.map(|key| filter_predicate.filter(key).map_err(|x| x.into()))
249+
.collect::<Result<Vec<_>>>()?;
250+
}
251+
};
215252
// reuse existing `Rows` to avoid reallocations
216253
let rows = &mut self.scratch_rows;
217254
rows.clear();
218255
self.row_converter.append(rows, &sort_keys)?;
219256

220257
let mut batch_entry = self.heap.register_batch(batch.clone());
221258

222-
let replacements =
223-
self.find_new_topk_items(0..sort_keys[0].len(), &mut batch_entry);
259+
let replacements = match selected_rows {
260+
Some(filter) => {
261+
self.find_new_topk_items(filter.values().set_indices(), &mut batch_entry)
262+
}
263+
None => self.find_new_topk_items(0..sort_keys[0].len(), &mut batch_entry),
264+
};
224265

225266
if replacements > 0 {
226267
self.metrics.row_replacements.add(replacements);

0 commit comments

Comments
 (0)