feat!: add allow_large_results option to read_gbq_query, aligning with bpd.options.compute.allow_large_results option #1935
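In brief, the new behavior this PR describes, as a minimal sketch (the query and option values below are illustrative, not taken from the PR):

import bigframes.pandas as bpd

# Session-wide default; read_gbq_query now falls back to this when no
# per-call value is given.
bpd.options.compute.allow_large_results = True

# Per-call override: skip the large-results destination table and read
# the response directly for a query known to return a small result.
df = bpd.read_gbq_query(
    "SELECT 1 AS x",  # illustrative query
    allow_large_results=False,
)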

Open: wants to merge 24 commits into main
Changes from all commits (24)
cdec057  feat: add `allow_large_results` option to `read_gbq_query` (tswast, Jul 24, 2025)
3b78da7  add system test (tswast, Jul 24, 2025)
ad11cc8  Merge remote-tracking branch 'origin/main' into allow_large_results (tswast, Jul 24, 2025)
68f1a10  add to pandas module (tswast, Jul 24, 2025)
b1e31e3  Merge branch 'main' into allow_large_results (tswast, Aug 20, 2025)
3b770ae  default to global option (tswast, Aug 20, 2025)
d01cdcb  Merge remote-tracking branch 'origin/allow_large_results' into allow_… (tswast, Aug 20, 2025)
10a8302  fix unit test (tswast, Aug 20, 2025)
938fb89  tweak imports so I can manually run doctest (tswast, Aug 21, 2025)
78d29f0  support index_col and columns (tswast, Aug 21, 2025)
435c602  add system tests and fix pandas warning (tswast, Aug 21, 2025)
7199fee  Merge remote-tracking branch 'origin/main' into allow_large_results (tswast, Aug 21, 2025)
778746f  use global option in remaining functions (tswast, Aug 21, 2025)
766640a  Merge remote-tracking branch 'origin/main' into allow_large_results (tswast, Aug 21, 2025)
95b2fdf  supply allow_large_results=False when max_results is set (tswast, Aug 21, 2025)
05c145c  fix last? failing test (tswast, Aug 21, 2025)
e87b548  Merge branch 'main' into allow_large_results (tswast, Aug 21, 2025)
58f45f8  fix vector search tests (tswast, Aug 21, 2025)
d611337  Merge remote-tracking branch 'origin/allow_large_results' into allow_… (tswast, Aug 21, 2025)
6d0fe48  fix vector search tests again (tswast, Aug 21, 2025)
8c27c2c  fix arima tests (tswast, Aug 21, 2025)
5ab21a9  fix more tests (tswast, Aug 21, 2025)
2ca9e9e  try again (tswast, Aug 21, 2025)
0c8b88c  and again (tswast, Aug 21, 2025)
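The commits "default to global option" and "supply allow_large_results=False when max_results is set" suggest a resolution order for the effective setting. A sketch of that order, as a hypothetical helper rather than code from this PR:

from typing import Optional

def resolve_allow_large_results(
    explicit: Optional[bool],
    max_results: Optional[int],
    global_default: bool,
) -> bool:
    if explicit is not None:
        # An explicit per-call argument wins.
        return explicit
    if max_results is not None:
        # A row cap keeps the result small enough for a normal response,
        # so the large-results destination table is unnecessary.
        return False
    # Otherwise fall back to bpd.options.compute.allow_large_results.
    return global_default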
2 changes: 1 addition & 1 deletion bigframes/dataframe.py
@@ -4419,7 +4419,7 @@ def to_dict(
allow_large_results: Optional[bool] = None,
**kwargs,
) -> dict | list[dict]:
- return self.to_pandas(allow_large_results=allow_large_results).to_dict(orient, into, **kwargs) # type: ignore
+ return self.to_pandas(allow_large_results=allow_large_results).to_dict(orient=orient, into=into, **kwargs) # type: ignore

def to_excel(
self,
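The one-line dataframe.py change above passes orient and into by keyword instead of by position. A small illustration of why, assuming the pandas DataFrame.to_dict signature:

import collections
import pandas as pd

pdf = pd.DataFrame({"a": [1, 2]})

# Keyword form keeps working even if pandas reorders these parameters or
# makes them keyword-only in a future release.
as_dict = pdf.to_dict(orient="index", into=collections.OrderedDict)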
28 changes: 24 additions & 4 deletions bigframes/pandas/io/api.py
@@ -187,6 +187,7 @@ def read_gbq( # type: ignore[overload-overlap]
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[False] = ...,
allow_large_results: Optional[bool] = ...,
) -> bigframes.dataframe.DataFrame:
...

@@ -203,6 +204,7 @@ def read_gbq(
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[True] = ...,
allow_large_results: Optional[bool] = ...,
) -> pandas.Series:
...

@@ -218,6 +220,7 @@ def read_gbq(
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
dry_run: bool = False,
allow_large_results: Optional[bool] = None,
) -> bigframes.dataframe.DataFrame | pandas.Series:
_set_default_session_location_if_possible(query_or_table)
return global_session.with_default_session(
@@ -231,6 +234,7 @@ def read_gbq(
use_cache=use_cache,
col_order=col_order,
dry_run=dry_run,
allow_large_results=allow_large_results,
)


@@ -400,6 +404,7 @@ def read_gbq_query( # type: ignore[overload-overlap]
col_order: Iterable[str] = ...,
filters: vendored_pandas_gbq.FiltersType = ...,
dry_run: Literal[False] = ...,
allow_large_results: Optional[bool] = ...,
) -> bigframes.dataframe.DataFrame:
...

@@ -416,6 +421,7 @@ def read_gbq_query(
col_order: Iterable[str] = ...,
filters: vendored_pandas_gbq.FiltersType = ...,
dry_run: Literal[True] = ...,
allow_large_results: Optional[bool] = ...,
) -> pandas.Series:
...

@@ -431,6 +437,7 @@ def read_gbq_query(
col_order: Iterable[str] = (),
filters: vendored_pandas_gbq.FiltersType = (),
dry_run: bool = False,
allow_large_results: Optional[bool] = None,
) -> bigframes.dataframe.DataFrame | pandas.Series:
_set_default_session_location_if_possible(query)
return global_session.with_default_session(
@@ -444,6 +451,7 @@ def read_gbq_query(
col_order=col_order,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)


@@ -617,7 +625,11 @@ def from_glob_path(


def _get_bqclient() -> bigquery.Client:
- clients_provider = bigframes.session.clients.ClientsProvider(
+ # Address circular imports in doctest due to bigframes/session/__init__.py
+ # containing a lot of logic and samples.
+ from bigframes.session import clients
+
+ clients_provider = clients.ClientsProvider(
project=config.options.bigquery.project,
location=config.options.bigquery.location,
use_regional_endpoints=config.options.bigquery.use_regional_endpoints,
@@ -631,11 +643,15 @@ def _get_bqclient() -> bigquery.Client:


def _dry_run(query, bqclient) -> bigquery.QueryJob:
+ # Address circular imports in doctest due to bigframes/session/__init__.py
+ # containing a lot of logic and samples.
+ from bigframes.session import metrics as bf_metrics
+
job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True))

# Fix for b/435183833. Log metrics even if a Session isn't available.
- if bigframes.session.metrics.LOGGING_NAME_ENV_VAR in os.environ:
- metrics = bigframes.session.metrics.ExecutionMetrics()
+ if bf_metrics.LOGGING_NAME_ENV_VAR in os.environ:
+ metrics = bf_metrics.ExecutionMetrics()
metrics.count_job_stats(job)
return job

@@ -645,6 +661,10 @@ def _set_default_session_location_if_possible(query):


def _set_default_session_location_if_possible_deferred_query(create_query):
+ # Address circular imports in doctest due to bigframes/session/__init__.py
+ # containing a lot of logic and samples.
+ from bigframes.session._io import bigquery
+
# Set the location as per the query if this is the first query the user is
# running and:
# (1) Default session has not started yet, and
@@ -666,7 +686,7 @@ def _set_default_session_location_if_possible_deferred_query(create_query):
query = create_query()
bqclient = _get_bqclient()

- if bigframes.session._io.bigquery.is_query(query):
+ if bigquery.is_query(query):
# Intentionally run outside of the session so that we can detect the
# location before creating the session. Since it's a dry_run, labels
# aren't necessary.
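Two patterns recur in this file: function-level imports (the "Address circular imports" comments) that defer module resolution to call time so bigframes/session/__init__.py can load while api.py is still loading, and a dry-run query used to detect a location before any session exists. A standalone sketch of the latter, using only the google-cloud-bigquery client; the helper name is illustrative, not the module's actual code:

from typing import Optional

from google.cloud import bigquery

def detect_query_location(sql: str) -> Optional[str]:
    # A dry run costs nothing and still returns job metadata, including
    # the location BigQuery resolved from the tables the query references.
    client = bigquery.Client()
    job = client.query(sql, job_config=bigquery.QueryJobConfig(dry_run=True))
    return job.location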
125 changes: 105 additions & 20 deletions bigframes/session/__init__.py
@@ -60,6 +60,7 @@

from bigframes import exceptions as bfe
from bigframes import version
import bigframes._config
import bigframes._config.bigquery_options as bigquery_options
import bigframes.clients
import bigframes.constants
@@ -132,6 +133,10 @@ def __init__(
context: Optional[bigquery_options.BigQueryOptions] = None,
clients_provider: Optional[bigframes.session.clients.ClientsProvider] = None,
):
+ # Address circular imports in doctest due to bigframes/session/__init__.py
+ # containing a lot of logic and samples.
+ from bigframes.session import anonymous_dataset, clients, loader, metrics
+
_warn_if_bf_version_is_obsolete()

if context is None:
@@ -167,7 +172,7 @@ def __init__(
if clients_provider:
self._clients_provider = clients_provider
else:
- self._clients_provider = bigframes.session.clients.ClientsProvider(
+ self._clients_provider = clients.ClientsProvider(
project=context.project,
location=self._location,
use_regional_endpoints=context.use_regional_endpoints,
@@ -219,15 +224,13 @@ def __init__(
else bigframes.enums.DefaultIndexKind.NULL
)

- self._metrics = bigframes.session.metrics.ExecutionMetrics()
+ self._metrics = metrics.ExecutionMetrics()
self._function_session = bff_session.FunctionSession()
- self._anon_dataset_manager = (
- bigframes.session.anonymous_dataset.AnonymousDatasetManager(
- self._clients_provider.bqclient,
- location=self._location,
- session_id=self._session_id,
- kms_key=self._bq_kms_key_name,
- )
- )
+ self._anon_dataset_manager = anonymous_dataset.AnonymousDatasetManager(
+ self._clients_provider.bqclient,
+ location=self._location,
+ session_id=self._session_id,
+ kms_key=self._bq_kms_key_name,
+ )
# Session temp tables don't support specifying kms key, so use anon dataset if kms key specified
self._session_resource_manager = (
@@ -241,7 +244,7 @@ def __init__(
self._temp_storage_manager = (
self._session_resource_manager or self._anon_dataset_manager
)
- self._loader = bigframes.session.loader.GbqDataLoader(
+ self._loader = loader.GbqDataLoader(
session=self,
bqclient=self._clients_provider.bqclient,
storage_manager=self._temp_storage_manager,
@@ -395,6 +398,7 @@ def read_gbq( # type: ignore[overload-overlap]
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[False] = ...,
allow_large_results: Optional[bool] = ...,
) -> dataframe.DataFrame:
...

@@ -411,6 +415,7 @@ def read_gbq(
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[True] = ...,
allow_large_results: Optional[bool] = ...,
) -> pandas.Series:
...

@@ -425,8 +430,8 @@ def read_gbq(
filters: third_party_pandas_gbq.FiltersType = (),
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
- dry_run: bool = False
- # Add a verify index argument that fails if the index is not unique.
+ dry_run: bool = False,
+ allow_large_results: Optional[bool] = None,
) -> dataframe.DataFrame | pandas.Series:
# TODO(b/281571214): Generate prompt to show the progress of read_gbq.
if columns and col_order:
@@ -436,6 +441,9 @@ def read_gbq(
elif col_order:
columns = col_order

if allow_large_results is None:
allow_large_results = bigframes._config.options._allow_large_results

if bf_io_bigquery.is_query(query_or_table):
return self._loader.read_gbq_query( # type: ignore # for dry_run overload
query_or_table,
@@ -446,6 +454,7 @@ def read_gbq(
use_cache=use_cache,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)
else:
if configuration is not None:
@@ -521,6 +530,8 @@ def _read_gbq_colab(
if pyformat_args is None:
pyformat_args = {}

allow_large_results = bigframes._config.options._allow_large_results

query = bigframes.core.pyformat.pyformat(
query,
pyformat_args=pyformat_args,
@@ -533,10 +544,7 @@ def _read_gbq_colab(
index_col=bigframes.enums.DefaultIndexKind.NULL,
force_total_order=False,
dry_run=typing.cast(Union[Literal[False], Literal[True]], dry_run),
- # TODO(tswast): we may need to allow allow_large_results to be overwritten
- # or possibly a general configuration object for an explicit
- # destination table and write disposition.
- allow_large_results=False,
+ allow_large_results=allow_large_results,
)

@overload
Expand All @@ -552,6 +560,7 @@ def read_gbq_query( # type: ignore[overload-overlap]
col_order: Iterable[str] = ...,
filters: third_party_pandas_gbq.FiltersType = ...,
dry_run: Literal[False] = ...,
allow_large_results: Optional[bool] = ...,
) -> dataframe.DataFrame:
...

Expand All @@ -568,6 +577,7 @@ def read_gbq_query(
col_order: Iterable[str] = ...,
filters: third_party_pandas_gbq.FiltersType = ...,
dry_run: Literal[True] = ...,
allow_large_results: Optional[bool] = ...,
) -> pandas.Series:
...

Expand All @@ -583,6 +593,7 @@ def read_gbq_query(
col_order: Iterable[str] = (),
filters: third_party_pandas_gbq.FiltersType = (),
dry_run: bool = False,
allow_large_results: Optional[bool] = None,
) -> dataframe.DataFrame | pandas.Series:
"""Turn a SQL query into a DataFrame.

@@ -632,9 +643,48 @@ def read_gbq_query(

See also: :meth:`Session.read_gbq`.

Args:
query (str):
A SQL query to execute.
index_col (Iterable[str] or str, optional):
The column(s) to use as the index for the DataFrame. This can be
a single column name or a list of column names. If not provided,
a default index will be used.
columns (Iterable[str], optional):
The columns to read from the query result. If not
specified, all columns will be read.
configuration (dict, optional):
A dictionary of query job configuration options. See the
BigQuery REST API documentation for a list of available options:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query
max_results (int, optional):
The maximum number of rows to retrieve from the query
result. If not specified, all rows will be loaded.
use_cache (bool, optional):
Whether to use cached results for the query. Defaults to ``True``.
Setting this to ``False`` will force a re-execution of the query.
col_order (Iterable[str], optional):
The desired order of columns in the resulting DataFrame. This
parameter is deprecated and will be removed in a future version.
Use ``columns`` instead.
filters (list[tuple], optional):
A list of filters to apply to the data. Filters are specified
as a list of tuples, where each tuple contains a column name,
an operator (e.g., '==', '!='), and a value.
dry_run (bool, optional):
If ``True``, the function will not actually execute the query but
will instead return statistics about the query. Defaults to
``False``.
allow_large_results (bool, optional):
Whether to allow large query results. If ``True``, the query
results can be larger than the maximum response size.
Defaults to ``bpd.options.compute.allow_large_results``.

Returns:
- bigframes.pandas.DataFrame:
- A DataFrame representing results of the query or table.
+ bigframes.pandas.DataFrame or pandas.Series:
+ A DataFrame representing the result of the query. If ``dry_run``
+ is ``True``, a ``pandas.Series`` containing query statistics is
+ returned.

Raises:
ValueError:
@@ -649,6 +699,9 @@ def read_gbq_query(
elif col_order:
columns = col_order

if allow_large_results is None:
allow_large_results = bigframes._config.options._allow_large_results

return self._loader.read_gbq_query( # type: ignore # for dry_run overload
query=query,
index_col=index_col,
Expand All @@ -658,6 +711,7 @@ def read_gbq_query(
use_cache=use_cache,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)

@overload
@@ -715,9 +769,40 @@ def read_gbq_table(

See also: :meth:`Session.read_gbq`.

Args:
table_id (str):
The identifier of the BigQuery table to read.
index_col (Iterable[str] or str, optional):
The column(s) to use as the index for the DataFrame. This can be
a single column name or a list of column names. If not provided,
a default index will be used.
columns (Iterable[str], optional):
The columns to read from the table. If not specified, all
columns will be read.
max_results (int, optional):
The maximum number of rows to retrieve from the table. If not
specified, all rows will be loaded.
filters (list[tuple], optional):
A list of filters to apply to the data. Filters are specified
as a list of tuples, where each tuple contains a column name,
an operator (e.g., '==', '!='), and a value.
use_cache (bool, optional):
Whether to use cached results for the query. Defaults to ``True``.
Setting this to ``False`` will force a re-execution of the query.
col_order (Iterable[str], optional):
The desired order of columns in the resulting DataFrame. This
parameter is deprecated and will be removed in a future version.
Use ``columns`` instead.
dry_run (bool, optional):
If ``True``, the function will not actually execute the query but
will instead return statistics about the table. Defaults to
``False``.

Returns:
- bigframes.pandas.DataFrame:
- A DataFrame representing results of the query or table.
+ bigframes.pandas.DataFrame or pandas.Series:
+ A DataFrame representing the contents of the table. If
+ ``dry_run`` is ``True``, a ``pandas.Series`` containing table
+ statistics is returned.

Raises:
ValueError:
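Pulling the documented read_gbq_query parameters together, a hedged end-to-end sketch; the project, table, and column names are invented for illustration:

import bigframes.pandas as bpd

# Dry run first: returns a pandas.Series of query statistics without
# executing the query.
stats = bpd.read_gbq_query(
    "SELECT * FROM `my-project.my_dataset.events`",  # illustrative table
    dry_run=True,
)

# Then read for real, shaping the result with the documented parameters.
df = bpd.read_gbq_query(
    "SELECT * FROM `my-project.my_dataset.events`",
    index_col="event_id",
    columns=["event_type", "created_at"],
    filters=[("event_type", "==", "click")],
)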