Commit 44b3f30

refactor: enhance test helper functions for improved readability and consistency
1 parent 00b1fe0 commit 44b3f30

File tree: 1 file changed

datafusion/core/tests/sql/aggregates.rs
Lines changed: 81 additions & 175 deletions
@@ -248,7 +248,32 @@ impl TestData {
     }
 }
 
-/// Sets up test context and tables for both single and multiple partition scenarios
+/// ## Test Helper Function Guide
+///
+/// This file provides several helper functions to reduce repetitive test patterns:
+///
+/// ### For tests using TestData:
+/// - `run_complete_snapshot_test()` - Complete test: setup → SQL → assert_snapshot!
+/// - `run_complete_sorted_snapshot_test()` - Same as above but with sorted output
+/// - `run_snapshot_test()` - Setup and execute, returns results for custom assertions
+///
+/// ### For custom setups:
+/// - `run_simple_snapshot_test()` - Execute SQL on existing context, returns results
+///
+/// ### Supporting functions:
+/// - `create_test_dict()` - Create dictionary arrays with slices (preferred)
+/// - `setup_test_contexts()` - Setup single and multi-partition contexts
+/// - `test_query_consistency()` - Execute and verify consistency across partitions
+///
+/// ### Usage examples:
+/// ```rust
+/// // Simple complete test
+/// run_complete_snapshot_test(&test_data, "SELECT * FROM t", @"expected output").await?;
+///
+/// // Multiple tests with different data
+/// let results = run_snapshot_test(&test_data, "SELECT * FROM t", false).await?;
+/// assert_snapshot!(batches_to_string(&results), @"expected");
+/// ```
 async fn setup_test_contexts(
     test_data: &TestData,
 ) -> Result<(SessionContext, SessionContext)> {
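The guide above leans on two supporting functions: `setup_test_contexts()` builds a single-partition and a multi-partition `SessionContext` over the same data, and `test_query_consistency()` runs a query against both and requires the answers to agree. A minimal sketch of that consistency check, assuming the `batches_to_sort_string` utility from `datafusion_common::test_util` (the function is used later in this diff; the name `check_partition_consistency` and the exact import paths are illustrative, not the file's actual implementation):

```rust
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::common::test_util::batches_to_sort_string;
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

// Illustrative version of the consistency check the helpers wrap:
// run the same SQL on both contexts and compare the row-sorted text
// rendering, since multi-partition execution may reorder rows.
async fn check_partition_consistency(
    ctx_single: &SessionContext,
    ctx_multi: &SessionContext,
    sql: &str,
) -> Result<Vec<RecordBatch>> {
    let single = ctx_single.sql(sql).await?.collect().await?;
    let multi = ctx_multi.sql(sql).await?.collect().await?;
    assert_eq!(
        batches_to_sort_string(&single),
        batches_to_sort_string(&multi)
    );
    Ok(single)
}
```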
@@ -762,11 +787,8 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> {
     let test_data_median = TestData::new_for_median();
 
     // Test COUNT null exclusion with basic data
-    let (ctx_single, ctx_multi) = setup_test_contexts(&test_data_basic).await?;
-
     let sql_count = "SELECT dict_null_keys, COUNT(value) as cnt FROM t GROUP BY dict_null_keys ORDER BY dict_null_keys NULLS FIRST";
-    let results_count =
-        test_query_consistency(&ctx_single, &ctx_multi, sql_count).await?;
+    let results_count = run_snapshot_test(&test_data_basic, sql_count, false).await?;
 
     assert_snapshot!(
         batches_to_string(&results_count),
@@ -782,12 +804,8 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> {
     );
 
     // Test SUM null handling with extended data
-    let (ctx_single_sum, ctx_multi_sum) =
-        setup_test_contexts(&test_data_extended).await?;
-
     let sql_sum = "SELECT dict_null_vals, SUM(value) as total FROM t GROUP BY dict_null_vals ORDER BY dict_null_vals NULLS FIRST";
-    let results_sum =
-        test_query_consistency(&ctx_single_sum, &ctx_multi_sum, sql_sum).await?;
+    let results_sum = run_snapshot_test(&test_data_extended, sql_sum, false).await?;
 
     assert_snapshot!(
         batches_to_string(&results_sum),
@@ -804,11 +822,8 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> {
     );
 
     // Test MIN null handling with min/max data
-    let (ctx_single_min, ctx_multi_min) = setup_test_contexts(&test_data_min_max).await?;
-
     let sql_min = "SELECT dict_null_keys, MIN(value) as minimum FROM t GROUP BY dict_null_keys ORDER BY dict_null_keys NULLS FIRST";
-    let results_min =
-        test_query_consistency(&ctx_single_min, &ctx_multi_min, sql_min).await?;
+    let results_min = run_snapshot_test(&test_data_min_max, sql_min, false).await?;
 
     assert_snapshot!(
         batches_to_string(&results_min),
@@ -825,12 +840,8 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> {
     );
 
     // Test MEDIAN null handling with median data
-    let (ctx_single_median, ctx_multi_median) =
-        setup_test_contexts(&test_data_median).await?;
-
     let sql_median = "SELECT dict_null_vals, MEDIAN(value) as median_value FROM t GROUP BY dict_null_vals ORDER BY dict_null_vals NULLS FIRST";
-    let results_median =
-        test_query_consistency(&ctx_single_median, &ctx_multi_median, sql_median).await?;
+    let results_median = run_snapshot_test(&test_data_median, sql_median, false).await?;
 
     assert_snapshot!(
         batches_to_string(&results_median),
@@ -852,12 +863,11 @@ async fn test_aggregates_null_handling_comprehensive() -> Result<()> {
 #[tokio::test]
 async fn test_first_last_val_null_handling() -> Result<()> {
     let test_data = TestData::new_for_first_last();
-    let (ctx_single, ctx_multi) = setup_test_contexts(&test_data).await?;
 
     // Test FIRST_VALUE and LAST_VALUE with window functions over groups
     let sql = "SELECT dict_null_keys, value, FIRST_VALUE(value) OVER (PARTITION BY dict_null_keys ORDER BY value NULLS FIRST) as first_val, LAST_VALUE(value) OVER (PARTITION BY dict_null_keys ORDER BY value NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_val FROM t ORDER BY dict_null_keys NULLS FIRST, value NULLS FIRST";
 
-    let results_single = test_query_consistency(&ctx_single, &ctx_multi, sql).await?;
+    let results_single = run_snapshot_test(&test_data, sql, false).await?;
 
     assert_snapshot!(batches_to_string(&results_single), @r"
     +----------------+-------+-----------+----------+
@@ -1142,7 +1152,6 @@ async fn create_fuzz_context_with_partitions(
 
     Ok(ctx)
 }
-
 /// Splits fuzz test data into multiple batches for partitioning
 fn split_fuzz_data_into_batches(
     test_data: &FuzzTestData,
@@ -1213,161 +1222,11 @@ async fn test_max_with_fuzz_table_dict_nulls() -> Result<()> {
 | 3      |                     | str_e    | str_e |         |
 | 3      | dict_c              | str_f    | str_f | value_6 |
 +--------+---------------------+----------+-------+---------+
-    "
-    );
+    ");
 
     Ok(())
 }
 
-/// Test data structure for fuzz table with timestamp and dictionary columns containing nulls
-struct FuzzTimestampTestData {
-    schema: Arc<Schema>,
-    utf8_low: StringArray,
-    u8_low: UInt8Array,
-    dictionary_utf8_low: DictionaryArray<UInt32Type>,
-    timestamp_us: TimestampMicrosecondArray,
-}
-
-impl FuzzTimestampTestData {
-    fn new() -> Self {
-        // Create dictionary columns with null keys and values
-        let dictionary_utf8_low = create_test_dict(
-            &[Some("dict_x"), None, Some("dict_y"), Some("dict_z")],
-            &[
-                Some(0), // dict_x
-                Some(1), // null value
-                Some(2), // dict_y
-                None,    // null key
-                Some(0), // dict_x
-                Some(1), // null value
-                Some(3), // dict_z
-                None,    // null key
-                Some(2), // dict_y
-            ],
-        );
-
-        let utf8_low = StringArray::from(vec![
-            Some("alpha"),
-            Some("beta"),
-            Some("gamma"),
-            Some("delta"),
-            Some("alpha"),
-            Some("epsilon"),
-            Some("zeta"),
-            Some("delta"),
-            Some("gamma"),
-        ]);
-
-        let u8_low = UInt8Array::from(vec![
-            Some(10),
-            Some(20),
-            Some(30),
-            Some(20),
-            Some(10),
-            Some(40),
-            Some(30),
-            Some(20),
-            Some(30),
-        ]);
-
-        // Create timestamp data with some nulls
-        let timestamp_us = TimestampMicrosecondArray::from(vec![
-            Some(1000000), // 1970-01-01 00:00:01
-            Some(2000000), // 1970-01-01 00:00:02
-            Some(3000000), // 1970-01-01 00:00:03
-            None,          // null timestamp
-            Some(1500000), // 1970-01-01 00:00:01.5
-            Some(4000000), // 1970-01-01 00:00:04
-            Some(2500000), // 1970-01-01 00:00:02.5
-            Some(3500000), // 1970-01-01 00:00:03.5
-            Some(2800000), // 1970-01-01 00:00:02.8
-        ]);
-
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("utf8_low", DataType::Utf8, true),
-            Field::new("u8_low", DataType::UInt8, true),
-            Field::new("dictionary_utf8_low", string_dict_type(), true),
-            Field::new(
-                "timestamp_us",
-                DataType::Timestamp(TimeUnit::Microsecond, None),
-                true,
-            ),
-        ]));
-
-        Self {
-            schema,
-            utf8_low,
-            u8_low,
-            dictionary_utf8_low,
-            timestamp_us,
-        }
-    }
-}
-
-/// Sets up test contexts for fuzz table with timestamps and both single and multiple partitions
-async fn setup_fuzz_timestamp_test_contexts() -> Result<(SessionContext, SessionContext)>
-{
-    let test_data = FuzzTimestampTestData::new();
-
-    // Single partition context
-    let ctx_single = create_fuzz_timestamp_context_with_partitions(&test_data, 1).await?;
-
-    // Multiple partition context
-    let ctx_multi = create_fuzz_timestamp_context_with_partitions(&test_data, 3).await?;
-
-    Ok((ctx_single, ctx_multi))
-}
-
-/// Creates a session context with fuzz timestamp table partitioned into specified number of partitions
-async fn create_fuzz_timestamp_context_with_partitions(
-    test_data: &FuzzTimestampTestData,
-    num_partitions: usize,
-) -> Result<SessionContext> {
-    let ctx = SessionContext::new_with_config(
-        SessionConfig::new().with_target_partitions(num_partitions),
-    );
-
-    let batches = split_fuzz_timestamp_data_into_batches(test_data, num_partitions)?;
-    let provider = MemTable::try_new(test_data.schema.clone(), batches)?;
-    ctx.register_table("fuzz_table", Arc::new(provider))?;
-
-    Ok(ctx)
-}
-
-/// Splits fuzz timestamp test data into multiple batches for partitioning
-fn split_fuzz_timestamp_data_into_batches(
-    test_data: &FuzzTimestampTestData,
-    num_partitions: usize,
-) -> Result<Vec<Vec<RecordBatch>>> {
-    debug_assert!(num_partitions > 0, "num_partitions must be greater than 0");
-    let total_len = test_data.utf8_low.len();
-    let chunk_size = total_len.div_ceil(num_partitions);
-
-    let mut batches = Vec::new();
-    let mut start = 0;
-
-    while start < total_len {
-        let end = min(start + chunk_size, total_len);
-        let len = end - start;
-
-        if len > 0 {
-            let batch = RecordBatch::try_new(
-                test_data.schema.clone(),
-                vec![
-                    Arc::new(test_data.utf8_low.slice(start, len)),
-                    Arc::new(test_data.u8_low.slice(start, len)),
-                    Arc::new(test_data.dictionary_utf8_low.slice(start, len)),
-                    Arc::new(test_data.timestamp_us.slice(start, len)),
-                ],
-            )?;
-            batches.push(vec![batch]);
-        }
-        start = end;
-    }
-
-    Ok(batches)
-}
-
 /// Test MIN with fuzz table containing dictionary columns with null keys and values and timestamp data (single and multiple partitions)
 #[tokio::test]
 async fn test_min_timestamp_with_fuzz_table_dict_nulls() -> Result<()> {
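The block removed above duplicated, for one more schema, the same partition-splitting machinery the file already carries for its other fuzz tables (`split_fuzz_data_into_batches`, `split_fuzz_count_data_into_batches`, `split_fuzz_median_data_into_batches`). The shared shape of those functions could be captured once over any set of equal-length columns; a hypothetical sketch, not part of this commit:

```rust
use std::cmp::min;
use std::sync::Arc;

use datafusion::arrow::array::{Array, ArrayRef};
use datafusion::arrow::datatypes::Schema;
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::Result;

// Hypothetical generic splitter mirroring the per-schema
// split_fuzz_*_data_into_batches functions: carve equal-length
// columns into num_partitions batches of roughly equal size.
fn split_columns_into_batches(
    schema: Arc<Schema>,
    columns: &[ArrayRef],
    num_partitions: usize,
) -> Result<Vec<Vec<RecordBatch>>> {
    debug_assert!(num_partitions > 0, "num_partitions must be greater than 0");
    let total_len = columns.first().map_or(0, |c| c.len());
    let chunk_size = total_len.div_ceil(num_partitions);

    let mut batches = Vec::new();
    let mut start = 0;
    while start < total_len {
        let end = min(start + chunk_size, total_len);
        let len = end - start;
        // Zero-copy slices of every column for this partition.
        let sliced: Vec<ArrayRef> =
            columns.iter().map(|c| c.slice(start, len)).collect();
        batches.push(vec![RecordBatch::try_new(schema.clone(), sliced)?]);
        start = end;
    }
    Ok(batches)
}
```

Each partition receives `ceil(total_len / num_partitions)` rows except possibly the last, matching the `div_ceil`-based chunking in the per-schema splitters.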
@@ -1547,7 +1406,6 @@ async fn create_fuzz_count_context_with_partitions(
 
     Ok(ctx)
 }
-
 /// Splits fuzz count test data into multiple batches for partitioning
 fn split_fuzz_count_data_into_batches(
     test_data: &FuzzCountTestData,
@@ -1791,7 +1649,6 @@ async fn create_fuzz_median_context_with_partitions(
 
     Ok(ctx)
 }
-
 /// Splits fuzz median test data into multiple batches for partitioning
 fn split_fuzz_median_data_into_batches(
     test_data: &FuzzMedianTestData,
@@ -1869,3 +1726,52 @@ async fn test_median_distinct_with_fuzz_table_dict_nulls() -> Result<()> {
 
     Ok(())
 }
+
+/// Helper function to run snapshot tests with consistent setup, execution, and assertion
+/// This reduces the repetitive pattern of "setup data → SQL → assert_snapshot!"
+async fn run_snapshot_test(
+    test_data: &TestData,
+    sql: &str,
+    use_sorted_output: bool,
+) -> Result<Vec<RecordBatch>> {
+    let (ctx_single, ctx_multi) = setup_test_contexts(test_data).await?;
+    let results = test_query_consistency(&ctx_single, &ctx_multi, sql).await?;
+    Ok(results)
+}
+
+/// Helper function for simpler snapshot tests that only need single-partition execution
+async fn run_simple_snapshot_test(
+    ctx: &SessionContext,
+    sql: &str,
+) -> Result<Vec<RecordBatch>> {
+    let df = ctx.sql(sql).await?;
+    let results = df.collect().await?;
+    Ok(results)
+}
+
+/// Helper function to run a complete snapshot test with TestData
+/// This fully encapsulates the "setup data → SQL → assert_snapshot!" pattern
+async fn run_complete_snapshot_test(
+    test_data: &TestData,
+    sql: &str,
+    expected_snapshot: &str,
+) -> Result<()> {
+    let results = run_snapshot_test(test_data, sql, false).await?;
+
+    assert_snapshot!(batches_to_string(&results), expected_snapshot);
+
+    Ok(())
+}
+
+/// Helper function to run a complete snapshot test with sorted output
+async fn run_complete_sorted_snapshot_test(
+    test_data: &TestData,
+    sql: &str,
+    expected_snapshot: &str,
+) -> Result<()> {
+    let results = run_snapshot_test(test_data, sql, true).await?;
+
+    assert_snapshot!(batches_to_sort_string(&results), expected_snapshot);
+
+    Ok(())
+}
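With these helpers in place, a call site that previously built two contexts and invoked `test_query_consistency` by hand shrinks to a single call per query. Two details are worth noting in the code above: `run_snapshot_test` accepts a `use_sorted_output` flag but never consults it (the sorted variant is chosen by the caller, since `run_complete_sorted_snapshot_test` applies `batches_to_sort_string` itself), and `run_complete_snapshot_test` takes the expected snapshot as a plain `&str`, whereas the inline `@"..."` form shown in the doc-comment guide is macro syntax that only applies inside `assert_snapshot!`. A hypothetical call site under those signatures, where the SQL, the placeholder snapshot text, and the `TestData::new()` constructor are assumed for illustration:

```rust
// Sketch of a test written against the new helpers.
#[tokio::test]
async fn test_count_groups_with_helper() -> Result<()> {
    let test_data = TestData::new(); // assumed constructor, like new_for_median()

    // One call wraps setup_test_contexts + test_query_consistency
    // + assert_snapshot! for the unsorted rendering.
    run_complete_snapshot_test(
        &test_data,
        "SELECT dict_null_keys, COUNT(value) AS cnt FROM t GROUP BY dict_null_keys",
        "<expected pretty-printed table>", // placeholder snapshot text
    )
    .await
}
```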
