chore: enable order_by and limit for new compiler #1815

Merged
merged 2 commits on Jun 16, 2025
20 changes: 18 additions & 2 deletions bigframes/core/compile/sqlglot/compiler.py
@@ -125,9 +125,25 @@ def _compile_result_node(self, root: nodes.ResultNode) -> str:
(name, scalar_compiler.compile_scalar_expression(ref))
for ref, name in root.output_cols
)
sqlglot_ir = sqlglot_ir.select(selected_cols)
# Skip squashing selections to preserve the ordering and limit keys
sqlglot_ir = sqlglot_ir.select(selected_cols, squash_selections=False)

if root.order_by is not None:
ordering_cols = tuple(
sge.Ordered(
this=scalar_compiler.compile_scalar_expression(
ordering.scalar_expression
),
desc=ordering.direction.is_ascending is False,
nulls_first=ordering.na_last is False,
)
for ordering in root.order_by.all_ordering_columns
)
sqlglot_ir = sqlglot_ir.order_by(ordering_cols)

if root.limit is not None:
sqlglot_ir = sqlglot_ir.limit(root.limit)

# TODO: add order_by, limit to sqlglot_expr
return sqlglot_ir.sql

@functools.lru_cache(maxsize=5000)
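For context, both new clauses lean on sqlglot's builder API: the compiler builds sge.Ordered nodes and hands them to order_by, then chains limit. The following standalone sketch is not part of this diff and uses invented table/column names; it only illustrates how such a node renders under the BigQuery dialect:

    import sqlglot
    import sqlglot.expressions as sge

    # Mirrors the construction in _compile_result_node: ascending, NULLS LAST.
    ordered = sge.Ordered(
        this=sge.column("int64_col"),
        desc=False,         # ascending order
        nulls_first=False,  # i.e. NULLS LAST
    )

    sql = sqlglot.select("int64_col").from_("test_table").order_by(ordered).limit(10).sql(dialect="bigquery")
    # Roughly: SELECT int64_col FROM test_table ORDER BY int64_col ASC NULLS LAST LIMIT 10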
46 changes: 36 additions & 10 deletions bigframes/core/compile/sqlglot/sqlglot_ir.py
@@ -28,7 +28,7 @@
from bigframes.core import guid
import bigframes.core.compile.sqlglot.sqlglot_types as sgt
import bigframes.core.local_data as local_data
import bigframes.core.schema as schemata
import bigframes.core.schema as bf_schema

# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0.
try:
@@ -67,7 +67,7 @@ def sql(self) -> str:
def from_pyarrow(
cls,
pa_table: pa.Table,
schema: schemata.ArraySchema,
schema: bf_schema.ArraySchema,
uid_gen: guid.SequentialUIDGenerator,
) -> SQLGlotIR:
"""Builds SQLGlot expression from pyarrow table."""
@@ -203,6 +203,7 @@ def from_union(
def select(
self,
selected_cols: tuple[tuple[str, sge.Expression], ...],
squash_selections: bool = True,
) -> SQLGlotIR:
selections = [
sge.Alias(
@@ -211,15 +212,39 @@ def select(
)
for id, expr in selected_cols
]
# Attempts to simplify selected columns when the original and new column
# names are simply aliases of each other.
squashed_selections = _squash_selections(self.expr.expressions, selections)
if squashed_selections != []:
new_expr = self.expr.select(*squashed_selections, append=False)
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)

# If squashing is enabled, we try to simplify the selections
# by checking if the new selections are simply aliases of the
# original columns.
if squash_selections:
new_selections = _squash_selections(self.expr.expressions, selections)
if new_selections != []:
new_expr = self.expr.select(*new_selections, append=False)
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)

new_expr = self._encapsulate_as_cte().select(*selections, append=False)
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)

def order_by(
self,
ordering: tuple[sge.Ordered, ...],
) -> SQLGlotIR:
"""Adds ORDER BY clause to the query."""
if len(ordering) == 0:
return SQLGlotIR(expr=self.expr.copy(), uid_gen=self.uid_gen)
new_expr = self.expr.order_by(*ordering)
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)

def limit(
self,
limit: int | None,
) -> SQLGlotIR:
"""Adds LIMIT clause to the query."""
if limit is not None:
new_expr = self.expr.limit(limit)
else:
new_expr = self.expr.copy()
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)

def project(
self,
@@ -342,6 +367,7 @@ def _squash_selections(
old_expr: list[sge.Expression], new_expr: list[sge.Alias]
) -> list[sge.Alias]:
"""
TODO: Re-enable this function to optimize the SQL.
Simplifies the select column expressions if existing (old_expr) and
new (new_expr) selected columns are both simple aliases of column definitions.

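The snapshot churn below follows directly from squash_selections=False: instead of folding the new aliases into the existing SELECT, select() wraps the current query in a CTE (via the internal _encapsulate_as_cte) and re-aliases every column on top of it, which is the extra bfcte_* layer visible in the diffs. A rough approximation with plain sqlglot builders, using invented bfcte_0/bfcol_* names in the snapshot style:

    import sqlglot

    inner = sqlglot.select("rowindex AS bfcol_0", "int64_col AS bfcol_1").from_("test_table")

    # Wrap the current query as a CTE and re-alias its columns on top.
    outer = (
        sqlglot.select("bfcol_0 AS rowindex", "bfcol_1 AS int64_col")
        .from_("bfcte_0")
        .with_("bfcte_0", as_=inner)
    )
    print(outer.sql(dialect="bigquery"))
    # Roughly: WITH bfcte_0 AS (SELECT rowindex AS bfcol_0, int64_col AS bfcol_1 FROM test_table)
    #          SELECT bfcol_0 AS rowindex, bfcol_1 AS int64_col FROM bfcte_0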
@@ -104,4 +104,7 @@ SELECT
`bfcol_47` AS `rowindex_1`,
`bfcol_48` AS `int64_col`,
`bfcol_49` AS `string_col`
FROM `bfcte_12`
ORDER BY
`bfcol_50` ASC NULLS LAST,
`bfcol_51` ASC NULLS LAST
@@ -15,11 +15,19 @@ WITH `bfcte_0` AS (
`bfcol_4` AS `bfcol_8`,
`bfcol_1` + 1 AS `bfcol_9`
FROM `bfcte_0`
), `bfcte_2` AS (
SELECT
`bfcol_5` AS `bfcol_10`,
`bfcol_9` AS `bfcol_11`,
`bfcol_6` AS `bfcol_12`,
`bfcol_7` AS `bfcol_13`,
`bfcol_8` AS `bfcol_14`
FROM `bfcte_1`
)
SELECT
`bfcol_5` AS `rowindex`,
`bfcol_9` AS `int64_col`,
`bfcol_6` AS `string_col`,
`bfcol_7` AS `float64_col`,
`bfcol_8` AS `bool_col`
FROM `bfcte_1`
`bfcol_10` AS `rowindex`,
`bfcol_11` AS `int64_col`,
`bfcol_12` AS `string_col`,
`bfcol_13` AS `float64_col`,
`bfcol_14` AS `bool_col`
FROM `bfcte_2`
@@ -155,21 +155,42 @@ WITH `bfcte_0` AS (
CAST(NULL AS TIMESTAMP),
8
)])
), `bfcte_1` AS (
SELECT
`bfcol_0` AS `bfcol_16`,
`bfcol_1` AS `bfcol_17`,
`bfcol_2` AS `bfcol_18`,
`bfcol_3` AS `bfcol_19`,
`bfcol_4` AS `bfcol_20`,
`bfcol_5` AS `bfcol_21`,
`bfcol_6` AS `bfcol_22`,
`bfcol_7` AS `bfcol_23`,
`bfcol_8` AS `bfcol_24`,
`bfcol_9` AS `bfcol_25`,
`bfcol_10` AS `bfcol_26`,
`bfcol_11` AS `bfcol_27`,
`bfcol_12` AS `bfcol_28`,
`bfcol_13` AS `bfcol_29`,
`bfcol_14` AS `bfcol_30`,
`bfcol_15` AS `bfcol_31`
FROM `bfcte_0`
)
SELECT
`bfcol_0` AS `rowindex`,
`bfcol_1` AS `bool_col`,
`bfcol_2` AS `bytes_col`,
`bfcol_3` AS `date_col`,
`bfcol_4` AS `datetime_col`,
`bfcol_5` AS `geography_col`,
`bfcol_6` AS `int64_col`,
`bfcol_7` AS `int64_too`,
`bfcol_8` AS `numeric_col`,
`bfcol_9` AS `float64_col`,
`bfcol_10` AS `rowindex_1`,
`bfcol_11` AS `rowindex_2`,
`bfcol_12` AS `string_col`,
`bfcol_13` AS `time_col`,
`bfcol_14` AS `timestamp_col`
FROM `bfcte_0`
`bfcol_16` AS `rowindex`,
`bfcol_17` AS `bool_col`,
`bfcol_18` AS `bytes_col`,
`bfcol_19` AS `date_col`,
`bfcol_20` AS `datetime_col`,
`bfcol_21` AS `geography_col`,
`bfcol_22` AS `int64_col`,
`bfcol_23` AS `int64_too`,
`bfcol_24` AS `numeric_col`,
`bfcol_25` AS `float64_col`,
`bfcol_26` AS `rowindex_1`,
`bfcol_27` AS `rowindex_2`,
`bfcol_28` AS `string_col`,
`bfcol_29` AS `time_col`,
`bfcol_30` AS `timestamp_col`
FROM `bfcte_1`
ORDER BY
`bfcol_31` ASC NULLS LAST
@@ -2,7 +2,14 @@ WITH `bfcte_0` AS (
SELECT
*
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` JSON, `bfcol_1` INT64>>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
), `bfcte_1` AS (
SELECT
`bfcol_0` AS `bfcol_2`,
`bfcol_1` AS `bfcol_3`
FROM `bfcte_0`
)
SELECT
`bfcol_0` AS `json_col`
FROM `bfcte_0`
`bfcol_2` AS `json_col`
FROM `bfcte_1`
ORDER BY
`bfcol_3` ASC NULLS LAST
@@ -32,14 +32,28 @@ WITH `bfcte_0` AS (
['', 'a'],
2
)])
), `bfcte_1` AS (
SELECT
`bfcol_0` AS `bfcol_9`,
`bfcol_1` AS `bfcol_10`,
`bfcol_2` AS `bfcol_11`,
`bfcol_3` AS `bfcol_12`,
`bfcol_4` AS `bfcol_13`,
`bfcol_5` AS `bfcol_14`,
`bfcol_6` AS `bfcol_15`,
`bfcol_7` AS `bfcol_16`,
`bfcol_8` AS `bfcol_17`
FROM `bfcte_0`
)
SELECT
`bfcol_0` AS `rowindex`,
`bfcol_1` AS `int_list_col`,
`bfcol_2` AS `bool_list_col`,
`bfcol_3` AS `float_list_col`,
`bfcol_4` AS `date_list_col`,
`bfcol_5` AS `date_time_list_col`,
`bfcol_6` AS `numeric_list_col`,
`bfcol_7` AS `string_list_col`
FROM `bfcte_0`
`bfcol_9` AS `rowindex`,
`bfcol_10` AS `int_list_col`,
`bfcol_11` AS `bool_list_col`,
`bfcol_12` AS `float_list_col`,
`bfcol_13` AS `date_list_col`,
`bfcol_14` AS `date_time_list_col`,
`bfcol_15` AS `numeric_list_col`,
`bfcol_16` AS `string_list_col`
FROM `bfcte_1`
ORDER BY
`bfcol_17` ASC NULLS LAST
@@ -18,8 +18,16 @@ WITH `bfcte_0` AS (
),
1
)])
), `bfcte_1` AS (
SELECT
`bfcol_0` AS `bfcol_3`,
`bfcol_1` AS `bfcol_4`,
`bfcol_2` AS `bfcol_5`
FROM `bfcte_0`
)
SELECT
`bfcol_0` AS `id`,
`bfcol_1` AS `person`
FROM `bfcte_0`
`bfcol_3` AS `id`,
`bfcol_4` AS `person`
FROM `bfcte_1`
ORDER BY
`bfcol_5` ASC NULLS LAST
@@ -0,0 +1,24 @@
WITH `bfcte_0` AS (
SELECT
`rowindex` AS `bfcol_0`,
`int64_col` AS `bfcol_1`,
`string_col` AS `bfcol_2`,
`float64_col` AS `bfcol_3`,
`bool_col` AS `bfcol_4`
FROM `test-project`.`test_dataset`.`test_table`
), `bfcte_1` AS (
SELECT
*,
`bfcol_1` AS `bfcol_5`
FROM `bfcte_0`
)
SELECT
`bfcol_0` AS `rowindex`,
`bfcol_1` AS `int64_col`,
`bfcol_2` AS `string_col`,
`bfcol_3` AS `float64_col`,
`bfcol_4` AS `bool_col`
FROM `bfcte_1`
ORDER BY
`bfcol_5` ASC NULLS LAST
LIMIT 10
@@ -0,0 +1,40 @@
WITH `bfcte_0` AS (
SELECT
`rowindex` AS `bfcol_0`,
`int64_col` AS `bfcol_1`,
`string_col` AS `bfcol_2`,
`float64_col` AS `bfcol_3`,
`bool_col` AS `bfcol_4`
FROM `test-project`.`test_dataset`.`test_table`
), `bfcte_1` AS (
SELECT
`bfcol_0` AS `bfcol_5`,
`bfcol_1` AS `bfcol_6`,
`bfcol_2` AS `bfcol_7`,
`bfcol_3` AS `bfcol_8`,
`bfcol_4` AS `bfcol_9`
FROM `bfcte_0`
), `bfcte_2` AS (
SELECT
*,
`bfcol_5` AS `bfcol_10`
FROM `bfcte_1`
), `bfcte_3` AS (
SELECT
`bfcol_5` AS `bfcol_11`,
`bfcol_6` AS `bfcol_12`,
`bfcol_7` AS `bfcol_13`,
`bfcol_8` AS `bfcol_14`,
`bfcol_9` AS `bfcol_15`,
`bfcol_10` AS `bfcol_16`
FROM `bfcte_2`
)
SELECT
`bfcol_11` AS `rowindex`,
`bfcol_12` AS `int64_col`,
`bfcol_13` AS `string_col`,
`bfcol_14` AS `float64_col`,
`bfcol_15` AS `bool_col`
FROM `bfcte_3`
ORDER BY
`bfcol_16` ASC NULLS LAST
12 changes: 12 additions & 0 deletions tests/unit/core/compile/sqlglot/test_compile_readtable.py
@@ -22,3 +22,15 @@
def test_compile_readtable(compiler_session: bigframes.Session, snapshot):
bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table")
snapshot.assert_match(bf_df.sql, "out.sql")


def test_compile_readtable_w_ordering(compiler_session: bigframes.Session, snapshot):
bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table")
bf_df = bf_df.set_index("rowindex").sort_index()
snapshot.assert_match(bf_df.sql, "out.sql")


def test_compile_readtable_w_limit(compiler_session: bigframes.Session, snapshot):
bf_df = compiler_session.read_gbq_table("test-project.test_dataset.test_table")
bf_df = bf_df.sort_values("int64_col").head(10)
snapshot.assert_match(bf_df.sql, "out.sql")
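
These tests exercise the feature end to end through the public DataFrame API and pin the output with the snapshots above. A hedged sketch of the equivalent user-facing call, assuming a configured BigFrames session (read_gbq_table, sort_values, head, and the .sql property are the same public methods the tests use):

    import bigframes.pandas as bpd

    df = bpd.read_gbq_table("test-project.test_dataset.test_table")
    sql = df.sort_values("int64_col").head(10).sql
    # Per the new snapshot above, the compiled query now ends with:
    #   ORDER BY `bfcol_5` ASC NULLS LAST
    #   LIMIT 10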