Skip to content
This repository was archived by the owner on Mar 13, 2026. It is now read-only.
Merged
4 changes: 1 addition & 3 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Generated by synthtool. DO NOT EDIT!
[run]
branch = True
omit =
google/__init__.py
google/cloud/__init__.py

[report]
fail_under = 96
fail_under = 95
show_missing = True
exclude_lines =
# Re-enable the standard pragma
Expand All @@ -34,6 +33,5 @@ exclude_lines =
omit =
*/gapic/*.py
*/proto/*.py
*/core/*.py
*/site-packages/*.py
google/cloud/__init__.py
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ jobs:
run: |
find .coverage-results -type f -name '*.zip' -exec unzip {} \;
coverage combine .coverage-results/**/.coverage*
coverage report --show-missing --fail-under=96
coverage report --show-missing --fail-under=95
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ docs.metadata
# Virtual environment
env/
venv/
.venv/

# Test logs
coverage.xml
Expand Down
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def cover(session):
test runs (not system test runs), and then erases coverage data.
"""
session.install("coverage", "pytest-cov")
session.run("coverage", "report", "--show-missing", "--fail-under=96")
session.run("coverage", "report", "--show-missing", "--fail-under=95")

# Make sure there is no dead code in our test directories.
session.run(
Expand Down
81 changes: 81 additions & 0 deletions pandas_gbq/core/biglake.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright (c) 2026 pandas-gbq Authors All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""
Utilities for working with BigLake tables.
"""

# TODO(tswast): Synchronize with bigframes/session/iceberg.py, which uses
# pyiceberg and the BigLake APIs, rather than relying on dry run.

from __future__ import annotations

import dataclasses
from typing import Sequence

import google.cloud.bigquery

import pandas_gbq.core.resource_references


# Query used with dry_run=True to discover a BigLake table's schema without
# scanning (or billing for) any data. Placeholders are filled from a
# BigLakeTableId in get_table_metadata().
_DRY_RUN_TEMPLATE = """
SELECT *
FROM `{project}.{catalog}.{namespace}.{table}`
"""


# Query that counts all rows in the table; uses the same placeholders as
# _DRY_RUN_TEMPLATE. NOTE(review): identifier parts are interpolated directly
# inside backticks — assumes reference components never contain backticks;
# confirm upstream validation.
_COUNT_TEMPLATE = """
SELECT COUNT(*) as total_rows
FROM `{project}.{catalog}.{namespace}.{table}`
"""


@dataclasses.dataclass(frozen=True)
class BigLakeTableMetadata:
    """Schema and size metadata for a BigLake table, as returned by
    :func:`get_table_metadata`.
    """

    # Column definitions as reported by a BigQuery dry-run query; empty
    # sequence when the dry run reports no schema.
    schema: Sequence[google.cloud.bigquery.SchemaField]
    # Total number of rows, from a COUNT(*) query.
    num_rows: int


def get_table_metadata(
    *,
    reference: pandas_gbq.core.resource_references.BigLakeTableId,
    bqclient: google.cloud.bigquery.Client,
) -> BigLakeTableMetadata:
    """Fetch the schema and row count for a BigLake table.

    Two queries are issued against BigQuery: a dry run of ``SELECT *``
    (returns the schema without scanning data) and a ``COUNT(*)`` query
    (this one does execute). In the future, we'll want to get other
    metadata like storage bytes so that we can do a more accurate estimate
    of how many rows to sample.

    Args:
        reference: Fully parsed identifier of the BigLake table.
        bqclient: Client used to run the queries.

    Returns:
        BigLakeTableMetadata with the table's schema and total row count.

    Raises:
        ValueError: If the ``COUNT(*)`` query returns anything other than
            exactly one row.
    """
    # Both templates share the same placeholders; build the mapping once so
    # the two queries can never disagree about which table they address.
    path = {
        "project": reference.project,
        "catalog": reference.catalog,
        "namespace": ".".join(reference.namespace),
        "table": reference.table,
    }

    # Dry run: validates the table path and yields the schema at no cost.
    dry_run_config = google.cloud.bigquery.QueryJobConfig(dry_run=True)
    job = bqclient.query(
        _DRY_RUN_TEMPLATE.format(**path), job_config=dry_run_config
    )
    job.result()
    schema = job.schema

    count_rows = list(bqclient.query_and_wait(_COUNT_TEMPLATE.format(**path)))
    # Was an ``assert`` — stripped under ``python -O``; raise explicitly so
    # this sanity check always runs.
    if len(count_rows) != 1:
        raise ValueError(
            "got unexpected query response when determining number of rows"
        )
    total_rows = count_rows[0].total_rows

    return BigLakeTableMetadata(
        schema=schema if schema is not None else [],
        num_rows=total_rows,
    )
67 changes: 67 additions & 0 deletions pandas_gbq/core/resource_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) 2026 pandas-gbq Authors All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

import dataclasses
import re
from typing import Union


# Splits a fully qualified table ID into project, inner parts (a dataset, or
# a catalog plus nested namespace), and table name. Consumed by
# parse_table_id().
# NOTE(review): the constant's name misspells "REFERENCE"; left as-is here
# because renaming requires a coordinated change at every use site.
_TABLE_REFEREENCE_PATTERN = re.compile(
    # In the past, organizations could prefix their project IDs with a domain
    # name. Such projects still exist, especially at Google.
    r"^(?P<legacy_project_domain>[^:]+:)?"
    r"(?P<project>[^.]+)\."
    # Match dataset or catalog + namespace.
    #
    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support
    # this without catastrophic backtracking by moving the trailing "." to the
    # table group.
    r"(?P<inner_parts>.*)"
    # Table names can't contain ".", as that's used as the separator.
    r"\.(?P<table>[^.]+)$"
)


@dataclasses.dataclass(frozen=True)
class BigLakeTableId:
    """Identifier for a BigLake (Iceberg) table:
    ``project.catalog.namespace...table``.
    """

    # Google Cloud project ID. As produced by parse_table_id(), any legacy
    # "domain:" prefix is not included here.
    project: str
    # Catalog name — the first component after the project.
    catalog: str
    # Remaining namespace components; may be arbitrarily deeply nested.
    namespace: tuple[str, ...]
    # Table name; never contains "." (it is the separator).
    table: str


@dataclasses.dataclass(frozen=True)
class BigQueryTableId:
    """Identifier for a regular BigQuery table: ``project.dataset.table``.

    NOTE(review): field names here use the ``_id`` suffix while
    BigLakeTableId uses bare names (``project`` vs ``project_id``);
    confirm intent before unifying.
    """

    # Google Cloud project ID.
    project_id: str
    # Dataset containing the table.
    dataset_id: str
    # Table name.
    table_id: str


def parse_table_id(table_id: str) -> Union[BigLakeTableId, BigQueryTableId]:
    """Parse *table_id* into a BigLakeTableId or BigQueryTableId.

    A single component between the project and table is treated as a
    BigQuery dataset; two or more components mean a BigLake catalog plus
    namespace.

    Raises:
        ValueError: If the table ID is invalid.
    """
    match = _TABLE_REFEREENCE_PATTERN.match(table_id)
    if match is None:
        raise ValueError(f"Invalid table ID: {table_id}")

    middle = match.group("inner_parts").split(".")
    # An empty component means consecutive/leading/trailing dots in the
    # middle section, e.g. "proj..table".
    if "" in middle:
        raise ValueError(f"Invalid table ID: {table_id}")

    project = match.group("project")
    table = match.group("table")

    # NOTE(review): the regex also captures "legacy_project_domain", but that
    # prefix is dropped from the returned ID — confirm whether domain-scoped
    # project IDs should be preserved.
    if len(middle) > 1:
        return BigLakeTableId(
            project=project,
            catalog=middle[0],
            namespace=tuple(middle[1:]),
            table=table,
        )

    return BigQueryTableId(
        project_id=project,
        dataset_id=middle[0],
        table_id=table,
    )
Loading
Loading