Commit 422bc49

Add predicate filtering
1 parent 19c126a commit 422bc49

File tree: 10 files changed (+119, -125 lines)


plateau/core/common_metadata.py

Lines changed: 1 addition & 9 deletions
@@ -166,15 +166,11 @@ def gen_metadata(schema: SchemaWrapper) -> dict[str, Any]:
             {
                 "name": field.name,
                 "field_name": field.name,
-                # the following fields are NOT accessed when resorting the columns
-                # "pandas_type": str(field.type), # optional
-                # "numpy_type": str(field.type), # optional
-                # "metadata": field.metadata, # optional: decode if needed
+                # other are NOT accessed when resorting the columns
             }
         )

     return pandas_metadata
-    # > {'columns': [{'name': 'A', 'field_name': 'A'}, {'name': 'B', 'field_name': 'B'}, {'name': 'C', 'field_name': 'C'}, {'name': 'D', 'field_name': 'D'}, {'name': 'E', 'field_name': 'E'}, {'name': 'F', 'field_name': 'F'}], 'index_columns': [], 'pandas_version': '2.2.3'}


 def normalize_column_order(schema, partition_keys=None):
@@ -284,10 +280,6 @@ def make_meta(obj, origin, partition_keys=None):
     elif isinstance(obj, pa.Table):
         return obj.schema

-        # normalize_column_order(
-        #     SchemaWrapper(obj.schema, origin), partition_keys=partition_keys
-        # )
-
     if not isinstance(obj, pd.DataFrame):
         raise ValueError("Input must be a pyarrow schema, or a pandas dataframe")


plateau/io/duckdb/dataframe.py

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@
 def read_table_as_ddb(
     uuid: str,
     store: KeyValueStore,
-    table: str,
+    as_table: str,
     predicates: list[list[tuple[str, str, Any]]] | None = None,
     **kwargs, # support for everything else
 ) -> duckdb.DuckDBPyConnection:
@@ -37,7 +37,7 @@ def read_table_as_ddb(

     table_obj = read_table_as_arrow(uuid, store=store, predicates=predicates, **kwargs)
     con = duckdb.connect()
-    con.register(table, table_obj)
+    con.register(as_table, table_obj)
     return con


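Note: a minimal usage sketch for the renamed parameter (not part of the commit). The store factory, store URL, and dataset uuid below are hypothetical placeholders, and the dataset is assumed to have been written beforehand.

from minimalkv import get_store_from_url  # assumed store factory, adjust to your setup

from plateau.io.duckdb.dataframe import read_table_as_ddb

store = get_store_from_url("hfs:///tmp/plateau-data")  # hypothetical local store

# Register the dataset as a DuckDB view named "readings"; the predicate is
# pushed down into the arrow read before registration.
con = read_table_as_ddb(
    "sensor_readings",  # hypothetical dataset uuid
    store,
    as_table="readings",
    predicates=[[("sensor_id", "==", "A1")]],
)
print(con.execute("SELECT COUNT(*) FROM readings").fetchone())

The rename from table to as_table makes the DuckDB registration target explicit at the call site instead of overloading the generic name "table".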
plateau/io/duckdb/helper.py

Lines changed: 2 additions & 15 deletions
@@ -54,9 +54,7 @@ def align_categories(tables: list[pa.Table], categoricals: list[str]) -> list[pa
     if not categoricals:
         return tables

-    # Process each categorical column
     for column in categoricals:
-        all_types = [table[column].type for table in tables]

         union_values = set()
         baseline_categories = None
@@ -69,29 +67,22 @@ def align_categories(tables: list[pa.Table], categoricals: list[str]) -> list[pa
                 continue

             col = table[column]
-            # Ensure the column is dictionary encoded.
-            # if not pa.types.is_dictionary(col.type):
-            #     col = pc.dictionary_encode(col)
-
-            # Combine chunks to get a single Array (if needed)
             col_combined = (
                 col.combine_chunks() if isinstance(col, pa.ChunkedArray) else col
             )
-            # Extract the dictionary as a Python list.
             cats = col_combined.dictionary.to_pylist()
             union_values.update(cats)
             if table.num_rows > baseline_num_rows:
                 baseline_num_rows = table.num_rows
                 baseline_categories = cats

         if baseline_categories is None:
-            # No table contained this column.
             continue

         # Build the new dictionary order: use the baseline order then add any additional values
+        # stay consistent with the utils:align_categories function
         extra = union_values - set(baseline_categories)
         new_dictionary = baseline_categories + sorted(extra)
-        # Build a lookup map for quick conversion: value -> new index
         union_map = {val: idx for idx, val in enumerate(new_dictionary)}

         # Second pass: recast the column in every table to use the new dictionary
@@ -104,24 +95,20 @@ def align_categories(tables: list[pa.Table], categoricals: list[str]) -> list[pa
             col = table[column]
             if not pa.types.is_dictionary(col.type):
                 col = pc.dictionary_encode(col)
-            # Decode the column to its raw values (as a Python list)
             col_combined = (
                 col.combine_chunks() if isinstance(col, pa.ChunkedArray) else col
             )
             decoded = col_combined.to_pylist()
-            # Map each value to the new dictionary index (preserving nulls)
             new_indices = [
                 union_map[val] if val is not None else None for val in decoded
             ]
             new_indices_array = pa.array(new_indices, type=pa.int32())
-            # Create a new dictionary array with the new dictionary
             new_dict_array = pa.DictionaryArray.from_arrays(
                 new_indices_array, pa.array(new_dictionary, type=col.type.value_type)
             )
-            # Replace the column in the table
             col_index = table.schema.get_field_index(column)
             table = table.set_column(col_index, column, new_dict_array)
             new_tables.append(table)
-        tables = new_tables # update tables for next categorical column
+        tables = new_tables

     return tables

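Note: a small illustration (not part of the commit) of what align_categories does after this cleanup; the input tables are made up.

import pyarrow as pa

from plateau.io.duckdb.helper import align_categories

# Two tables whose dictionary-encoded "color" column disagrees on categories.
t1 = pa.table({"color": pa.array(["red", "blue", "red"]).dictionary_encode()})
t2 = pa.table({"color": pa.array(["green", "blue"]).dictionary_encode()})

aligned = align_categories([t1, t2], ["color"])

# Both columns now share one dictionary: the categories of the largest table
# first ("red", "blue"), then the remaining values in sorted order ("green").
for t in aligned:
    print(t.column("color").combine_chunks().dictionary.to_pylist())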
plateau/io/iter.py

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ def read_dataset_as_metapartitions__iterator(
         MetaPartition.concat_metapartitions_arrow
         if arrow_mode
         else MetaPartition.concat_metapartitions
-    ) # Dirty, refactor later
+    )
     mp = concatenate(
         [
             mp_inner.load_dataframes(

plateau/io_components/metapartition.py

Lines changed: 4 additions & 11 deletions
@@ -256,7 +256,7 @@ def __init__(
         self.schema = schema
         self.table_name = table_name
         if data is not None and schema is None:
-            self.schema = make_meta( # handles pa.Table as well
+            self.schema = make_meta(
                 data, origin=f"{table_name}/{label}", partition_keys=partition_keys
             )

@@ -691,9 +691,7 @@ def load_dataframes(
                 predicate_pushdown_to_io=predicate_pushdown_to_io,
                 predicates=filtered_predicates,
                 date_as_object=dates_as_object,
-                **(
-                    {"return_pyarrow_table": True} if arrow_mode else {}
-                ), # dirty hack for now
+                **({"return_pyarrow_table": True} if arrow_mode else {}),
             )
             LOGGER.debug(
                 "Loaded dataframe %s in %s seconds.", self.file, time.time() - start
@@ -736,7 +734,6 @@ def load_dataframes(
                         ", ".join(sorted(missing_cols))
                     )
                 )
-            # Really ugly, refactor later!
             if arrow_mode and list(df_or_arrow.column_names) != columns:
                 # Arrow tables are immutable, so we need to create a new table
                 df_or_arrow = df_or_arrow.select(columns)
@@ -811,7 +808,6 @@ def _reconstruct_index_columns_arrow(
            )

            # Create an array filled with the repeated key value
-            # FIXME: remove pdb.set_trace()
            if categories and name in categories:
                # Use dictionary type (categorical)
                dictionary_array = pa.DictionaryArray.from_arrays(
@@ -824,8 +820,7 @@ def _reconstruct_index_columns_arrow(

            new_columns.append((name, arrow_value))

-        # Prepend new index columns
-        for name, array in reversed(new_columns): # insert in reverse to maintain order
+        for name, array in reversed(new_columns):
            table = table.append_column(name, array)

            # move newly added column to front
@@ -1210,7 +1205,7 @@ def partition_on(self, partition_on: str | Sequence[str]):
            partition_on = [partition_on]
        partition_on = self._ensure_compatible_partitioning(partition_on)

-        new_data = self._partition_data(partition_on) # WIP: needs arrow compatibility
+        new_data = self._partition_data(partition_on)

        for label, data in new_data.items():
            tmp_mp = MetaPartition(
@@ -1399,8 +1394,6 @@ def concat_metapartitions_arrow(

        new_table = pa.concat_tables(data)

-        # TODO: What about align_categories?
-
        new_schema = validate_compatible(schema)

        new_label = MetaPartition._merge_labels(metapartitions, label_merger)

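Note: the context around the removed FIXME shows how the arrow path rebuilds a partition-key column. Below is a standalone sketch of that idea (the names, values, and front-reordering are invented for illustration and are not the function itself):

import pyarrow as pa

# A partition ("country=DE") read back without its key column.
table = pa.table({"value": [1, 2, 3]})
name, key_value = "country", "DE"  # hypothetical key parsed from the storage path

# Constant, dictionary-encoded column: every row points at index 0 of ["DE"].
arrow_value = pa.DictionaryArray.from_arrays(
    pa.array([0] * table.num_rows, type=pa.int32()),
    pa.array([key_value]),
)

# Append the column, then move it to the front so index columns lead the schema.
table = table.append_column(name, arrow_value)
front = [table.num_columns - 1] + list(range(table.num_columns - 1))
table = table.select([table.column_names[i] for i in front])
print(table.column_names)  # ['country', 'value']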
plateau/io_components/utils.py

Lines changed: 4 additions & 2 deletions
@@ -126,16 +126,18 @@ def _ensure_compatible_indices(
 def group_table_by_partition_keys(table: pa.Table, partition_on: list[str]):
     """Yield tuples of partition keys and pyarrow tables (excluding the partition_on columns) using polars.

-    Pyarrow's groupby is not really useful for this specific purpose, thus the detour through polars."""
+    Pyarrow's groupby is not really useful for this specific purpose, thus the detour through polars.
+    """

     df = pl.from_arrow(table)

     groups = df.group_by(partition_on, maintain_order=True)

     for key, group in groups:
-        arrow_table = group.drop(partition_on).to_arrow() # drop partition keys
+        arrow_table = group.drop(partition_on).to_arrow()  # drop partition keys
         yield key, arrow_table

+
 def validate_partition_keys(
     dataset_uuid,
     store,

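Note: a short sketch (not part of the commit) of what the generator yields, assuming polars is installed; the data is made up.

import pyarrow as pa

from plateau.io_components.utils import group_table_by_partition_keys

table = pa.table({"country": ["DE", "DE", "US"], "value": [1, 2, 3]})

for key, part in group_table_by_partition_keys(table, ["country"]):
    # key holds the partition value(s); part no longer contains "country"
    print(key, part.column_names, part.num_rows)
# e.g. ('DE',) ['value'] 2
#      ('US',) ['value'] 1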
plateau/serialization/_generic.py

Lines changed: 65 additions & 19 deletions
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 """This module contains functionality for persisting/serialising DataFrames.

 Available constants
@@ -13,7 +12,11 @@
 :meta public:
 """

+import pdb
 import warnings
+from duckdb import arrow
+import pyarrow as pa
+import pyarrow.compute as pc
 from collections.abc import Iterable
 from typing import TYPE_CHECKING, TypeVar

@@ -265,6 +268,61 @@ def columns_in_predicates(predicates: PredicatesType) -> set[str]:
     return columns


+def _filter_df_or_table_from_predicates(
+    df_or_table: pd.DataFrame | pa.Table,
+    predicates: PredicatesType | None,
+    strict_date_types: bool = False,
+    arrow_mode: bool = False,
+) -> pd.DataFrame | pa.Table:
+    if predicates is None:
+        return df_or_table
+    indexer: npt.NDArray[np.bool_] = np.zeros(len(df_or_table), dtype=bool)
+    for conjunction in predicates:
+        inner_indexer: npt.NDArray[np.bool_] = np.ones(len(df_or_table), dtype=bool)
+        for column, op, value in conjunction:
+            column_name = ensure_unicode_string_type(column)
+            values = (
+                df_or_table.column(column_name).to_numpy()
+                if arrow_mode
+                else df_or_table[column_name].values
+            )
+            filter_array_like(
+                values,
+                op,
+                value,
+                inner_indexer,
+                inner_indexer,
+                strict_date_types=strict_date_types,
+                column_name=column_name,
+            )
+        indexer = inner_indexer | indexer
+
+    if not arrow_mode:
+        return df_or_table[indexer]
+
+    table_mask = pa.array(indexer, type=pa.bool_())
+    return df_or_table.filter(table_mask)
+
+
+# Casting pyarrow structures to numpy ones might introduce some overhead
+# but we do not have to maintain twice the logic for filtering from predicates
+def filter_table_from_predicates(table: pa.Table, predicates: PredicatesType):
+    """Filter a `pyarrow.Table` based on predicates in disjunctive normal
+    form.
+
+    See Also
+    --------
+    * :ref:`predicate_pushdown`
+    * :ref:`filter_df_from_predicates`
+    """
+    return _filter_df_or_table_from_predicates(
+        df_or_table=table,
+        predicates=predicates,
+        strict_date_types=False,
+        arrow_mode=True,
+    )
+
+
 def filter_df_from_predicates(
     df: pd.DataFrame,
     predicates: PredicatesType | None,
@@ -288,24 +346,12 @@ def filter_df_from_predicates(
     --------
     * :ref:`predicate_pushdown`
     """
-    if predicates is None:
-        return df
-    indexer: npt.NDArray[np.bool_] = np.zeros(len(df), dtype=bool)
-    for conjunction in predicates:
-        inner_indexer: npt.NDArray[np.bool_] = np.ones(len(df), dtype=bool)
-        for column, op, value in conjunction:
-            column_name = ensure_unicode_string_type(column)
-            filter_array_like(
-                df[column_name].values,
-                op,
-                value,
-                inner_indexer,
-                inner_indexer,
-                strict_date_types=strict_date_types,
-                column_name=column_name,
-            )
-        indexer = inner_indexer | indexer
-    return df[indexer]
+    return _filter_df_or_table_from_predicates(
+        df_or_table=df,
+        predicates=predicates,
+        strict_date_types=strict_date_types,
+        arrow_mode=False,
+    )


 def _handle_categorical_data(array_like, require_ordered):

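Note: since filter_table_from_predicates is the heart of this commit, a quick sketch of the predicate semantics (data made up). Predicates are in disjunctive normal form, exactly as for filter_df_from_predicates: inner lists are ANDed, outer lists are ORed.

import pyarrow as pa

from plateau.serialization._generic import filter_table_from_predicates

table = pa.table({"country": ["DE", "DE", "US", "FR"], "value": [1, 5, 3, 7]})

# (country == "DE" AND value > 2) OR country == "US"
predicates = [
    [("country", "==", "DE"), ("value", ">", 2)],
    [("country", "==", "US")],
]

filtered = filter_table_from_predicates(table, predicates)
print(filtered.to_pydict())
# expected: {'country': ['DE', 'US'], 'value': [5, 3]}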
plateau/serialization/_parquet.py

Lines changed: 17 additions & 5 deletions
@@ -20,6 +20,7 @@
     check_predicates,
     filter_df,
     filter_df_from_predicates,
+    filter_table_from_predicates,
 )
 from ._io_buffer import BlockBuffer
 from ._util import ensure_unicode_string_type, schema_metadata_bytes_to_object
@@ -47,7 +48,7 @@ def _empty_table_from_schema(parquet_file: ParquetFile) -> pa.Table:
     return schema.empty_table()


-def _reset_dictionary_columns(table: pa.Table, exclude=None):
+def _reset_dictionary_columns(table: pa.Table, exclude=None) -> pa.Table:
     """We need to ensure that the dtype is exactly as requested, see GH227."""
     if exclude is None:
         exclude = []
@@ -197,9 +198,7 @@ def _restore_dataframe(
        # otherwise full read en block is the better option.
        if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
            with pa.BufferReader(store.get(key)) as reader:
-                table = pq.read_pandas(
-                    reader, columns=columns
-                ) # TODO: is this relevant?
+                table = pq.read_pandas(reader, columns=columns)
        else:
            if HAVE_BOTO and isinstance(store, BotoStore):
                # Parquet and seeks on S3 currently leak connections thus
@@ -281,7 +280,20 @@ def _restore_dataframe(
        table = table.cast(schema_metadata_bytes_to_object(table.schema))

        if return_pyarrow_table:
-            return table
+            table.rename_columns(
+                [ensure_unicode_string_type(name) for name in table.column_names]
+            )
+
+            if filter_query:
+                raise ValueError(
+                    "filter_query is not supported when 'return_pyarrow_table' is True (if you use arrow_mode)."
+                    "Hint: please express your filter query as predicates."
+                )
+
+            if predicates:
+                table = filter_table_from_predicates(table, predicates)
+
+            return table if columns is None else table.select(columns)

        _coerce = {"coerce_temporal_nanoseconds": True}
        df = table.to_pandas(date_as_object=date_as_object, **_coerce)

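Note: an illustrative, self-contained sketch (not part of the commit) of what the new return_pyarrow_table branch does with the table it just read: filter by predicates first, then narrow to the requested columns. Plain pyarrow is used here instead of the serializer; the data is made up.

import io

import pyarrow as pa
import pyarrow.parquet as pq

from plateau.serialization._generic import filter_table_from_predicates

# Round-trip a tiny table through an in-memory parquet buffer.
buf = io.BytesIO()
pq.write_table(pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]}), buf)
table = pq.read_table(io.BytesIO(buf.getvalue()))

predicates = [[("x", ">", 1)]]
columns = ["y"]

# Same order as the new branch: predicate filtering first, column projection last.
table = filter_table_from_predicates(table, predicates)
table = table if columns is None else table.select(columns)
print(table.to_pydict())  # expected: {'y': ['b', 'c']}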