diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 1dbb74fbb7..6cd57dd452 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -21,7 +21,7 @@ import string import typing -from typing import Any, Optional, Union +from typing import Any, Optional, Tuple, Union import google.cloud.bigquery import pandas @@ -39,7 +39,11 @@ def _table_to_sql(table: _BQ_TABLE_TYPES) -> str: - return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" + # BiglakeIcebergTable IDs have 4 parts. BigFrames packs catalog.namespace + # into the dataset_id. + dataset_parts = table.dataset_id.split(".") + dataset_sql = ".".join(f"`{part}`" for part in dataset_parts) + return f"`{table.project}`.{dataset_sql}.`{table.table_id}`" def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str: @@ -102,6 +106,24 @@ def _field_to_template_value( return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name) if isinstance(value, bigframes.dataframe.DataFrame): + import bigframes.core.bq_data as bq_data + import bigframes.core.nodes as nodes + + # TODO(b/493608478): Remove this workaround for BigLake/Iceberg tables, + # which cannot currently be used in views, once a fix rolls out. + def is_biglake( + node: nodes.BigFrameNode, child_results: Tuple[bool, ...] 
+ ) -> bool: + if isinstance(node, nodes.ReadTableNode): + return isinstance(node.source.table, bq_data.BiglakeIcebergTable) + return any(child_results) + + contains_biglake = value._block.expr.node.reduce_up(is_biglake) + + if contains_biglake: + sql_query, _, _ = value._to_sql_query(include_index=False) + return f"({sql_query})" + return _table_to_sql(value._to_placeholder_table(dry_run=dry_run)) if isinstance(value, str): diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 61b22d0311..09cce2b0c4 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -519,9 +519,13 @@ def to_query( time_travel_timestamp: Optional[datetime.datetime] = None, ) -> str: """Compile query_or_table with conditions(filters, wildcards) to query.""" - sub_query = ( - f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`" - ) + if is_query(query_or_table): + sub_query = f"({query_or_table})" + else: + # Table ID can have 1, 2, 3, or 4 parts. Quoting all parts to be safe. + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers + parts = query_or_table.split(".") + sub_query = ".".join(f"`{part}`" for part in parts) # TODO(b/338111344): Generate an index based on DefaultIndexKind if we # don't have index columns specified. 
diff --git a/tests/unit/core/test_pyformat.py b/tests/unit/core/test_pyformat.py index db7cedba8f..be7f52f4d5 100644 --- a/tests/unit/core/test_pyformat.py +++ b/tests/unit/core/test_pyformat.py @@ -500,6 +500,15 @@ def test_pyformat_with_query_string_replaces_variables(session): ), "SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`", ), + ( + google.cloud.bigquery.TableReference( + google.cloud.bigquery.DatasetReference( + "my-project", "my-catalog.my-namespace" + ), + "my-table", + ), + "SELECT * FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`", + ), ), ) def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session): @@ -511,3 +520,51 @@ def test_pyformat_with_table_replaces_variables(table, expected_sql, session=ses sql = "SELECT * FROM {table}" got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) assert got_sql == expected_sql + + +def test_pyformat_with_bigframes_dataframe_biglake_table(session): + # Create a real BigFrames DataFrame that points to a BigLake table. + import bigframes.core.array_value as array_value + import bigframes.core.blocks as blocks + import bigframes.core.bq_data as bq_data + import bigframes.dataframe + + # Define the BigLake table + project_id = "my-project" + catalog_id = "my-catalog" + namespace_id = "my-namespace" + table_id = "my-table" + schema = (google.cloud.bigquery.SchemaField("col", "INTEGER"),) + + biglake_table = bq_data.BiglakeIcebergTable( + project_id=project_id, + catalog_id=catalog_id, + namespace_id=namespace_id, + table_id=table_id, + physical_schema=schema, + cluster_cols=(), + metadata=bq_data.TableMetadata( + location=bq_data.BigQueryRegion("us-central1"), + type="TABLE", + ), + ) + + # ArrayValue.from_table is what read_gbq uses. 
+ av = array_value.ArrayValue.from_table(biglake_table, session) + block = blocks.Block(av, index_columns=[], column_labels=["col"]) + df = bigframes.dataframe.DataFrame(block) + + pyformat_args = {"df": df} + sql = "SELECT * FROM {df}" + + got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session) + + # For BigLake, we now expect a SUBQUERY, not a view reference. + # The subquery should have a correctly quoted 4-part ID. + assert "SELECT" in got_sql + assert project_id in got_sql + assert catalog_id in got_sql + assert namespace_id in got_sql + assert table_id in got_sql + assert got_sql.startswith("SELECT * FROM (SELECT") + assert got_sql.endswith(")")