Skip to content
This repository was archived by the owner on Mar 13, 2026. It is now read-only.
Merged
4 changes: 1 addition & 3 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Generated by synthtool. DO NOT EDIT!
[run]
branch = True
omit =
google/__init__.py
google/cloud/__init__.py

[report]
fail_under = 96
fail_under = 95
show_missing = True
exclude_lines =
# Re-enable the standard pragma
Expand All @@ -34,6 +33,5 @@ exclude_lines =
omit =
*/gapic/*.py
*/proto/*.py
*/core/*.py
*/site-packages/*.py
google/cloud/__init__.py
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ jobs:
run: |
find .coverage-results -type f -name '*.zip' -exec unzip {} \;
coverage combine .coverage-results/**/.coverage*
coverage report --show-missing --fail-under=96
coverage report --show-missing --fail-under=95
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ docs.metadata
# Virtual environment
env/
venv/
.venv/

# Test logs
coverage.xml
Expand Down
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def cover(session):
test runs (not system test runs), and then erases coverage data.
"""
session.install("coverage", "pytest-cov")
session.run("coverage", "report", "--show-missing", "--fail-under=96")
session.run("coverage", "report", "--show-missing", "--fail-under=95")

# Make sure there is no dead code in our test directories.
session.run(
Expand Down
81 changes: 81 additions & 0 deletions pandas_gbq/core/biglake.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright (c) 2026 pandas-gbq Authors All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""
Utilities for working with BigLake tables.
"""

# TODO(tswast): Synchronize with bigframes/session/iceberg.py, which uses
# pyiceberg and the BigLake APIs, rather than relying on dry run.

from __future__ import annotations

import dataclasses
from typing import Sequence

import google.cloud.bigquery

import pandas_gbq.core.resource_references


# Query used with dry_run=True to discover a BigLake table's schema without
# scanning (or billing for) any data. Placeholders are filled from a
# BigLakeTableId in get_table_metadata().
_DRY_RUN_TEMPLATE = """
SELECT *
FROM `{project}.{catalog}.{namespace}.{table}`
"""


# Query that counts all rows in the table; uses the same placeholders as
# _DRY_RUN_TEMPLATE. NOTE(review): identifier parts are interpolated directly
# inside backticks — assumes reference components never contain backticks;
# confirm upstream validation.
_COUNT_TEMPLATE = """
SELECT COUNT(*) as total_rows
FROM `{project}.{catalog}.{namespace}.{table}`
"""


@dataclasses.dataclass(frozen=True)
class BigLakeTableMetadata:
    """Schema and size metadata for a BigLake table, as returned by
    :func:`get_table_metadata`.
    """

    # Column definitions as reported by a BigQuery dry-run query; empty
    # sequence when the dry run reports no schema.
    schema: Sequence[google.cloud.bigquery.SchemaField]
    # Total number of rows, from a COUNT(*) query.
    num_rows: int


def get_table_metadata(
    *,
    reference: pandas_gbq.core.resource_references.BigLakeTableId,
    bqclient: google.cloud.bigquery.Client,
) -> BigLakeTableMetadata:
    """Fetch the schema and row count for a BigLake table.

    Two queries are issued against BigQuery: a dry run of ``SELECT *``
    (returns the schema without scanning data) and a ``COUNT(*)`` query
    (this one does execute). In the future, we'll want to get other
    metadata like storage bytes so that we can do a more accurate estimate
    of how many rows to sample.

    Args:
        reference: Fully parsed identifier of the BigLake table.
        bqclient: Client used to run the queries.

    Returns:
        BigLakeTableMetadata with the table's schema and total row count.

    Raises:
        ValueError: If the ``COUNT(*)`` query returns anything other than
            exactly one row.
    """
    # Both templates share the same placeholders; build the mapping once so
    # the two queries can never disagree about which table they address.
    path = {
        "project": reference.project,
        "catalog": reference.catalog,
        "namespace": ".".join(reference.namespace),
        "table": reference.table,
    }

    # Dry run: validates the table path and yields the schema at no cost.
    dry_run_config = google.cloud.bigquery.QueryJobConfig(dry_run=True)
    job = bqclient.query(
        _DRY_RUN_TEMPLATE.format(**path), job_config=dry_run_config
    )
    job.result()
    schema = job.schema

    count_rows = list(bqclient.query_and_wait(_COUNT_TEMPLATE.format(**path)))
    # Was an ``assert`` — stripped under ``python -O``; raise explicitly so
    # this sanity check always runs.
    if len(count_rows) != 1:
        raise ValueError(
            "got unexpected query response when determining number of rows"
        )
    total_rows = count_rows[0].total_rows

    return BigLakeTableMetadata(
        schema=schema if schema is not None else [],
        num_rows=total_rows,
    )
67 changes: 67 additions & 0 deletions pandas_gbq/core/resource_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) 2026 pandas-gbq Authors All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

import dataclasses
import re
from typing import Union


# Splits a fully qualified table ID into project, inner parts (a dataset, or
# a catalog plus nested namespace), and table name. Consumed by
# parse_table_id().
# NOTE(review): the constant's name misspells "REFERENCE"; left as-is here
# because renaming requires a coordinated change at every use site.
_TABLE_REFEREENCE_PATTERN = re.compile(
    # In the past, organizations could prefix their project IDs with a domain
    # name. Such projects still exist, especially at Google.
    r"^(?P<legacy_project_domain>[^:]+:)?"
    r"(?P<project>[^.]+)\."
    # Match dataset or catalog + namespace.
    #
    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support
    # this without catastrophic backtracking by moving the trailing "." to the
    # table group.
    r"(?P<inner_parts>.*)"
    # Table names can't contain ".", as that's used as the separator.
    r"\.(?P<table>[^.]+)$"
)


@dataclasses.dataclass(frozen=True)
class BigLakeTableId:
    """Identifier for a BigLake (Iceberg) table:
    ``project.catalog.namespace...table``.
    """

    # Google Cloud project ID. As produced by parse_table_id(), any legacy
    # "domain:" prefix is not included here.
    project: str
    # Catalog name — the first component after the project.
    catalog: str
    # Remaining namespace components; may be arbitrarily deeply nested.
    namespace: tuple[str, ...]
    # Table name; never contains "." (it is the separator).
    table: str


@dataclasses.dataclass(frozen=True)
class BigQueryTableId:
    """Identifier for a regular BigQuery table: ``project.dataset.table``.

    NOTE(review): field names here use the ``_id`` suffix while
    BigLakeTableId uses bare names (``project`` vs ``project_id``);
    confirm intent before unifying.
    """

    # Google Cloud project ID.
    project_id: str
    # Dataset containing the table.
    dataset_id: str
    # Table name.
    table_id: str


def parse_table_id(table_id: str) -> Union[BigLakeTableId, BigQueryTableId]:
    """Parse *table_id* into a BigLakeTableId or BigQueryTableId.

    A single component between the project and table is treated as a
    BigQuery dataset; two or more components mean a BigLake catalog plus
    namespace.

    Raises:
        ValueError: If the table ID is invalid.
    """
    match = _TABLE_REFEREENCE_PATTERN.match(table_id)
    if match is None:
        raise ValueError(f"Invalid table ID: {table_id}")

    middle = match.group("inner_parts").split(".")
    # An empty component means consecutive/leading/trailing dots in the
    # middle section, e.g. "proj..table".
    if "" in middle:
        raise ValueError(f"Invalid table ID: {table_id}")

    project = match.group("project")
    table = match.group("table")

    # NOTE(review): the regex also captures "legacy_project_domain", but that
    # prefix is dropped from the returned ID — confirm whether domain-scoped
    # project IDs should be preserved.
    if len(middle) > 1:
        return BigLakeTableId(
            project=project,
            catalog=middle[0],
            namespace=tuple(middle[1:]),
            table=table,
        )

    return BigQueryTableId(
        project_id=project,
        dataset_id=middle[0],
        table_id=table,
    )
Loading
Loading