Skip to content

Commit cf9c22a

Browse files
perf: Produce simpler sql (#1836)
1 parent 0fffc49 commit cf9c22a

File tree

19 files changed

+269
-234
lines changed

19 files changed

+269
-234
lines changed

bigframes/core/compile/compiler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult:
6565
ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by
6666
result_node = dataclasses.replace(result_node, order_by=None)
6767
result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
68+
result_node = cast(nodes.ResultNode, rewrites.defer_selection(result_node))
6869
sql = compile_result_node(result_node)
6970
# Return the ordering iff no extra columns are needed to define the row order
7071
if ordering is not None:

bigframes/core/compile/googlesql/query.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def sql(self) -> str:
125125
return "\n".join(text)
126126

127127

128-
@dataclasses.dataclass
128+
@dataclasses.dataclass(frozen=True)
129129
class SelectExpression(abc.SQLSyntax):
130130
"""This class represents `select_expression`."""
131131

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult
8787
nodes.ResultNode, rewrite.column_pruning(result_node)
8888
)
8989
result_node = self._remap_variables(result_node)
90+
result_node = typing.cast(
91+
nodes.ResultNode, rewrite.defer_selection(result_node)
92+
)
9093
sql = self._compile_result_node(result_node)
9194
return configs.CompileResult(
9295
sql, result_node.schema.to_bigquery(), result_node.order_by
@@ -97,6 +100,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult
97100
result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node))
98101

99102
result_node = self._remap_variables(result_node)
103+
result_node = typing.cast(
104+
nodes.ResultNode, rewrite.defer_selection(result_node)
105+
)
100106
sql = self._compile_result_node(result_node)
101107
# Return the ordering iff no extra columns are needed to define the row order
102108
if ordering is not None:

bigframes/core/nodes.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def additive_base(self) -> BigFrameNode:
7575
...
7676

7777
@abc.abstractmethod
78-
def replace_additive_base(self, BigFrameNode):
78+
def replace_additive_base(self, BigFrameNode) -> BigFrameNode:
7979
...
8080

8181

@@ -1568,6 +1568,10 @@ class ExplodeNode(UnaryNode):
15681568
# Offsets are generated only if this is non-null
15691569
offsets_col: Optional[identifiers.ColumnId] = None
15701570

1571+
def _validate(self):
    """Assert that every entry of ``column_ids`` refers to an id of the child.

    Raises:
        AssertionError: if some ``col.id`` is not in ``self.child.ids``.
    """
    assert all(col.id in self.child.ids for col in self.column_ids)
1574+
15711575
@property
15721576
def row_preserving(self) -> bool:
15731577
return False
@@ -1646,6 +1650,10 @@ class ResultNode(UnaryNode):
16461650
limit: Optional[int] = None
16471651
# TODO: CTE definitions
16481652

1653+
def _validate(self):
    """Assert that every output column reference resolves to an id of the child.

    ``output_cols`` is iterated as ``(ref, name)`` pairs; only ``ref.id`` is
    checked, the name is not inspected.

    Raises:
        AssertionError: if some ``ref.id`` is not in ``self.child.ids``.
    """
    assert all(ref.id in self.child.ids for ref, _name in self.output_cols)
1656+
16491657
@property
16501658
def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
16511659
return ()

bigframes/core/rewrite/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
try_reduce_to_local_scan,
2323
try_reduce_to_table_scan,
2424
)
25+
from bigframes.core.rewrite.select_pullup import defer_selection
2526
from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice
2627
from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
2728
from bigframes.core.rewrite.windows import pull_out_window_order, rewrite_range_rolling
@@ -42,4 +43,5 @@
4243
"try_reduce_to_local_scan",
4344
"fold_row_counts",
4445
"pull_out_window_order",
46+
"defer_selection",
4547
]
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import dataclasses
16+
from typing import cast
17+
18+
from bigframes.core import expression, nodes
19+
20+
21+
def defer_selection(root: nodes.BigFrameNode) -> nodes.BigFrameNode:
    """
    Pull SelectionNode operations up the tree rooted at ``root``.

    The rewrite is performed by applying ``pull_up_select`` through
    ``nodes.bottom_up``. Deferring selections this way often lets them be
    merged with one another or removed altogether, which simplifies the tree.

    Args:
        root: The root of the tree to rewrite.

    Returns:
        The rewritten tree.
    """
    rewritten = nodes.bottom_up(root, pull_up_select)
    return rewritten
30+
31+
32+
def pull_up_select(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
    """Dispatch ``node`` to the select pull-up handler matching its node kind.

    Leaf nodes are returned unchanged. Join, concat and unary nodes are handed
    to their dedicated handlers, checked in that order. Any other node kind is
    returned unchanged.
    """
    result: nodes.BigFrameNode
    if isinstance(node, nodes.LeafNode):
        result = node
    elif isinstance(node, nodes.JoinNode):
        result = pull_up_selects_under_join(node)
    elif isinstance(node, nodes.ConcatNode):
        result = handle_selects_under_concat(node)
    elif isinstance(node, nodes.UnaryNode):
        result = pull_up_select_unary(node)
    else:
        # Not expected to be reached, but passing the node through is
        # preferable to crashing.
        result = node
    return result
43+
44+
45+
def pull_up_select_unary(node: nodes.UnaryNode) -> nodes.BigFrameNode:
    """Move a SelectionNode child of ``node`` above ``node`` where supported.

    If the child of ``node`` is not a SelectionNode, ``node`` is returned
    unchanged. Otherwise ``node`` has its references remapped from the
    selection's output ids to the ids the selection reads, is re-parented
    onto the selection's own child, and a selection is placed on top
    (or merged away), depending on the kind of ``node``:

    * Reversed / OrderBy / Slice / Filter / RandomSample: the selection is
      re-applied unchanged on top of the pushed-down node.
    * Selection / Result: the node is remapped and attached directly to the
      selection's child, so the child selection disappears.
    * Aggregate: a new selection over the pushed-down aggregate is built from
      the aggregate's ids.
    * Explode: the selection is re-applied on top, extended with the offsets
      column when one is set.
    * Additive nodes: the selection is re-applied on top, extended with the
      node's added fields.

    Any other node kind is returned unchanged.
    """
    selection = node.child
    if not isinstance(selection, nodes.SelectionNode):
        return node

    # Maps each id exposed by the selection to the id of the column it reads.
    output_to_input = {
        out_id: ref.id for ref, out_id in selection.input_output_pairs
    }
    grandchild = selection.child

    # Schema-preserving nodes
    schema_preserving = (
        nodes.ReversedNode,
        nodes.OrderByNode,
        nodes.SliceNode,
        nodes.FilterNode,
        nodes.RandomSampleNode,
    )
    if isinstance(node, schema_preserving):
        lowered: nodes.BigFrameNode = node.remap_refs(
            output_to_input
        ).replace_child(grandchild)
        return cast(nodes.SelectionNode, selection.replace_child(lowered))

    if isinstance(node, (nodes.SelectionNode, nodes.ResultNode)):
        return node.remap_refs(output_to_input).replace_child(grandchild)

    if isinstance(node, nodes.AggregateNode):
        lowered_agg = node.remap_refs(output_to_input).replace_child(grandchild)
        agg_selection = tuple(
            nodes.AliasedRef.identity(col_id).remap_refs(output_to_input)
            for col_id in node.ids
        )
        return nodes.SelectionNode(lowered_agg, agg_selection)

    if isinstance(node, nodes.ExplodeNode):
        lowered_explode = node.remap_refs(output_to_input).replace_child(
            grandchild
        )
        lifted = cast(
            nodes.SelectionNode, selection.replace_child(lowered_explode)
        )
        if not node.offsets_col:
            return lifted
        # The offsets column is produced by the explode itself, so the lifted
        # selection has to pass it through explicitly.
        offsets_pair = nodes.AliasedRef(
            expression.DerefOp(node.offsets_col), node.offsets_col
        )
        return dataclasses.replace(
            lifted,
            input_output_pairs=(*lifted.input_output_pairs, offsets_pair),
        )

    if isinstance(node, nodes.AdditiveNode):
        lowered_additive = node.replace_additive_base(grandchild).remap_refs(
            output_to_input
        )
        # Columns added by the node are passed through under their own ids.
        added_pairs = tuple(
            nodes.AliasedRef(expression.DerefOp(field.id), field.id)
            for field in node.added_fields
        )
        return dataclasses.replace(
            selection,
            child=lowered_additive,
            input_output_pairs=(*selection.input_output_pairs, *added_pairs),
        )

    # Not expected to be reached, but passing the node through is preferable
    # to crashing.
    return node
124+
125+
126+
def pull_up_selects_under_join(node: nodes.JoinNode) -> nodes.JoinNode:
    """Return ``node`` unchanged; selects are not pulled above joins yet.

    Pulling selects up here is possible in theory, but it is a bit dangerous,
    in particular for self-joins, when there are more transforms to do.
    """
    # TODO: Safely pull up selects above join
    return node
130+
131+
132+
def handle_selects_under_concat(node: nodes.ConcatNode) -> nodes.ConcatNode:
    """Strip selections directly under a concat that select every column in order.

    A child SelectionNode is replaced by its own child when the ids it reads
    are exactly that child's ids, in the same order. All other children are
    kept as they are.

    Args:
        node: The concat node whose children are inspected.

    Returns:
        A copy of ``node`` with the redundant child selections removed.
    """
    new_children = []
    for child in node.child_nodes:
        kept = child
        if isinstance(child, nodes.SelectionNode):
            # Build a tuple of the *input* ids. Comparing a bare generator
            # with a tuple is always False, which would leave this
            # elimination as dead code.
            inputs = tuple(ref.id for ref, _ in child.input_output_pairs)
            # NOTE(review): output aliases are deliberately ignored here, on
            # the assumption that concat combines its children positionally
            # so a pure rename is redundant -- confirm against ConcatNode.
            if inputs == tuple(child.child.ids):
                kept = child.child
        new_children.append(kept)
    return dataclasses.replace(node, children=tuple(new_children))

bigframes/core/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,12 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
148148
# first character must be letter or underscore
149149
identifier = "_" + identifier
150150

151+
else:
152+
# Even with flexible column names, there are constraints
153+
# Convert illegal characters
154+
# See: https://cloud.google.com/bigquery/docs/schemas#flexible-column-names
155+
identifier = re.sub(r"[!\"$\(\)\*\,\./;\?@[\]^`{}~]", "_", identifier)
156+
151157
# Except in special circumstances (true anonymous query results tables),
152158
# field names are not allowed to start with these (case-insensitive)
153159
# prefixes.

tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql

Lines changed: 23 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,79 +4,47 @@ WITH `bfcte_1` AS (
44
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` INT64, `bfcol_2` INT64, `bfcol_3` STRING, `bfcol_4` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
55
), `bfcte_3` AS (
66
SELECT
7-
`bfcol_0` AS `bfcol_5`,
8-
`bfcol_2` AS `bfcol_6`,
9-
`bfcol_1` AS `bfcol_7`,
10-
`bfcol_3` AS `bfcol_8`,
11-
`bfcol_4` AS `bfcol_9`
7+
*,
8+
`bfcol_4` AS `bfcol_10`
129
FROM `bfcte_1`
1310
), `bfcte_5` AS (
14-
SELECT
15-
*,
16-
`bfcol_9` AS `bfcol_10`
17-
FROM `bfcte_3`
18-
), `bfcte_7` AS (
19-
SELECT
20-
`bfcol_5` AS `bfcol_11`,
21-
`bfcol_6` AS `bfcol_12`,
22-
`bfcol_7` AS `bfcol_13`,
23-
`bfcol_8` AS `bfcol_14`,
24-
`bfcol_10` AS `bfcol_15`
25-
FROM `bfcte_5`
26-
), `bfcte_9` AS (
2711
SELECT
2812
*,
2913
0 AS `bfcol_16`
30-
FROM `bfcte_7`
31-
), `bfcte_10` AS (
14+
FROM `bfcte_3`
15+
), `bfcte_6` AS (
3216
SELECT
33-
`bfcol_11` AS `bfcol_17`,
34-
`bfcol_12` AS `bfcol_18`,
35-
`bfcol_13` AS `bfcol_19`,
36-
`bfcol_14` AS `bfcol_20`,
17+
`bfcol_0` AS `bfcol_17`,
18+
`bfcol_2` AS `bfcol_18`,
19+
`bfcol_1` AS `bfcol_19`,
20+
`bfcol_3` AS `bfcol_20`,
3721
`bfcol_16` AS `bfcol_21`,
38-
`bfcol_15` AS `bfcol_22`
39-
FROM `bfcte_9`
22+
`bfcol_10` AS `bfcol_22`
23+
FROM `bfcte_5`
4024
), `bfcte_0` AS (
4125
SELECT
4226
*
4327
FROM UNNEST(ARRAY<STRUCT<`bfcol_23` INT64, `bfcol_24` INT64, `bfcol_25` INT64, `bfcol_26` STRING, `bfcol_27` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
4428
), `bfcte_2` AS (
4529
SELECT
46-
`bfcol_23` AS `bfcol_28`,
47-
`bfcol_25` AS `bfcol_29`,
48-
`bfcol_24` AS `bfcol_30`,
49-
`bfcol_26` AS `bfcol_31`,
50-
`bfcol_27` AS `bfcol_32`
30+
*,
31+
`bfcol_27` AS `bfcol_33`
5132
FROM `bfcte_0`
5233
), `bfcte_4` AS (
5334
SELECT
5435
*,
55-
`bfcol_32` AS `bfcol_33`
36+
1 AS `bfcol_39`
5637
FROM `bfcte_2`
57-
), `bfcte_6` AS (
38+
), `bfcte_7` AS (
5839
SELECT
59-
`bfcol_28` AS `bfcol_34`,
60-
`bfcol_29` AS `bfcol_35`,
61-
`bfcol_30` AS `bfcol_36`,
62-
`bfcol_31` AS `bfcol_37`,
63-
`bfcol_33` AS `bfcol_38`
40+
`bfcol_23` AS `bfcol_40`,
41+
`bfcol_25` AS `bfcol_41`,
42+
`bfcol_24` AS `bfcol_42`,
43+
`bfcol_26` AS `bfcol_43`,
44+
`bfcol_39` AS `bfcol_44`,
45+
`bfcol_33` AS `bfcol_45`
6446
FROM `bfcte_4`
6547
), `bfcte_8` AS (
66-
SELECT
67-
*,
68-
1 AS `bfcol_39`
69-
FROM `bfcte_6`
70-
), `bfcte_11` AS (
71-
SELECT
72-
`bfcol_34` AS `bfcol_40`,
73-
`bfcol_35` AS `bfcol_41`,
74-
`bfcol_36` AS `bfcol_42`,
75-
`bfcol_37` AS `bfcol_43`,
76-
`bfcol_39` AS `bfcol_44`,
77-
`bfcol_38` AS `bfcol_45`
78-
FROM `bfcte_8`
79-
), `bfcte_12` AS (
8048
SELECT
8149
*
8250
FROM (
@@ -87,7 +55,7 @@ WITH `bfcte_1` AS (
8755
bfcol_20 AS `bfcol_49`,
8856
bfcol_21 AS `bfcol_50`,
8957
bfcol_22 AS `bfcol_51`
90-
FROM `bfcte_10`
58+
FROM `bfcte_6`
9159
UNION ALL
9260
SELECT
9361
bfcol_40 AS `bfcol_46`,
@@ -96,15 +64,15 @@ WITH `bfcte_1` AS (
9664
bfcol_43 AS `bfcol_49`,
9765
bfcol_44 AS `bfcol_50`,
9866
bfcol_45 AS `bfcol_51`
99-
FROM `bfcte_11`
67+
FROM `bfcte_7`
10068
)
10169
)
10270
SELECT
10371
`bfcol_46` AS `rowindex`,
10472
`bfcol_47` AS `rowindex_1`,
10573
`bfcol_48` AS `int64_col`,
10674
`bfcol_49` AS `string_col`
107-
FROM `bfcte_12`
75+
FROM `bfcte_8`
10876
ORDER BY
10977
`bfcol_50` ASC NULLS LAST,
11078
`bfcol_51` ASC NULLS LAST

0 commit comments

Comments
 (0)