Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions bigframes/core/pyformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

import string
import typing
from typing import Any, Optional, Union
from typing import Any, Optional, Tuple, Union

import google.cloud.bigquery
import pandas
Expand All @@ -39,7 +39,11 @@


def _table_to_sql(table: _BQ_TABLE_TYPES) -> str:
return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`"
# BiglakeIcebergTable IDs have 4 parts. BigFrames packs catalog.namespace
# into the dataset_id.
dataset_parts = table.dataset_id.split(".")
dataset_sql = ".".join(f"`{part}`" for part in dataset_parts)
return f"`{table.project}`.{dataset_sql}.`{table.table_id}`"


def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str:
Expand Down Expand Up @@ -102,6 +106,24 @@ def _field_to_template_value(
return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name)

if isinstance(value, bigframes.dataframe.DataFrame):
import bigframes.core.bq_data as bq_data
import bigframes.core.nodes as nodes

# TODO(b/493608478): Remove this workaround for BigLake/Iceberg tables,
# which cannot currently be used in views, once a fix rolls out.
def is_biglake(
node: nodes.BigFrameNode, child_results: Tuple[bool, ...]
) -> bool:
if isinstance(node, nodes.ReadTableNode):
return isinstance(node.source.table, bq_data.BiglakeIcebergTable)
return any(child_results)

contains_biglake = value._block.expr.node.reduce_up(is_biglake)

if contains_biglake:
sql_query, _, _ = value._to_sql_query(include_index=False)
return f"({sql_query})"

return _table_to_sql(value._to_placeholder_table(dry_run=dry_run))

if isinstance(value, str):
Expand Down
10 changes: 7 additions & 3 deletions bigframes/session/_io/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,9 +519,13 @@ def to_query(
time_travel_timestamp: Optional[datetime.datetime] = None,
) -> str:
"""Compile query_or_table with conditions(filters, wildcards) to query."""
sub_query = (
f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`"
)
if is_query(query_or_table):
sub_query = f"({query_or_table})"
else:
# Table ID can have 1, 2, 3, or 4 parts. Quoting all parts to be safe.
# See: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
parts = query_or_table.split(".")
sub_query = ".".join(f"`{part}`" for part in parts)

# TODO(b/338111344): Generate an index based on DefaultIndexKind if we
# don't have index columns specified.
Expand Down
57 changes: 57 additions & 0 deletions tests/unit/core/test_pyformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,15 @@ def test_pyformat_with_query_string_replaces_variables(session):
),
"SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`",
),
(
google.cloud.bigquery.TableReference(
google.cloud.bigquery.DatasetReference(
"my-project", "my-catalog.my-namespace"
),
"my-table",
),
"SELECT * FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`",
),
),
)
def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session):
Expand All @@ -511,3 +520,51 @@ def test_pyformat_with_table_replaces_variables(table, expected_sql, session=ses
sql = "SELECT * FROM {table}"
got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
assert got_sql == expected_sql


def test_pyformat_with_bigframes_dataframe_biglake_table(session):
    """BigLake-backed DataFrames must be inlined as a subquery, not a view ref."""
    # Build a real BigFrames DataFrame backed by a BigLake Iceberg table.
    import bigframes.core.array_value as array_value
    import bigframes.core.blocks as blocks
    import bigframes.core.bq_data as bq_data
    import bigframes.dataframe

    # Identifiers making up the 4-part BigLake table ID.
    project_id = "my-project"
    catalog_id = "my-catalog"
    namespace_id = "my-namespace"
    table_id = "my-table"
    schema = (google.cloud.bigquery.SchemaField("col", "INTEGER"),)

    biglake_table = bq_data.BiglakeIcebergTable(
        project_id=project_id,
        catalog_id=catalog_id,
        namespace_id=namespace_id,
        table_id=table_id,
        physical_schema=schema,
        cluster_cols=(),
        metadata=bq_data.TableMetadata(
            location=bq_data.BigQueryRegion("us-central1"),
            type="TABLE",
        ),
    )

    # ArrayValue.from_table mirrors what read_gbq does internally.
    arr_value = array_value.ArrayValue.from_table(biglake_table, session)
    df = bigframes.dataframe.DataFrame(
        blocks.Block(arr_value, index_columns=[], column_labels=["col"])
    )

    got_sql = pyformat.pyformat(
        "SELECT * FROM {df}", pyformat_args={"df": df}, session=session
    )

    # BigLake tables are expected to expand to a SUBQUERY (not a view
    # reference) whose 4-part table ID is correctly quoted.
    assert "SELECT" in got_sql
    for identifier in (project_id, catalog_id, namespace_id, table_id):
        assert identifier in got_sql
    assert got_sql.startswith("SELECT * FROM (SELECT")
    assert got_sql.endswith(")")
Loading