1- #!/usr/bin/env python
21"""This module contains functionality for persisting/serialising DataFrames.
32
43Available constants
1312:meta public:
1413"""
1514
15+ import pdb
1616import warnings
17+ from duckdb import arrow
18+ import pyarrow as pa
19+ import pyarrow .compute as pc
1720from collections .abc import Iterable
1821from typing import TYPE_CHECKING , TypeVar
1922
@@ -265,6 +268,61 @@ def columns_in_predicates(predicates: PredicatesType) -> set[str]:
265268 return columns
266269
267270
def _filter_df_or_table_from_predicates(
    df_or_table: pd.DataFrame | pa.Table,
    predicates: PredicatesType | None,
    strict_date_types: bool = False,
    arrow_mode: bool = False,
) -> pd.DataFrame | pa.Table:
    """Filter a DataFrame or Arrow table with predicates in disjunctive normal form.

    Shared implementation behind the pandas and pyarrow entry points: the
    boolean row mask is always computed on NumPy arrays, only the column
    access and the final row selection differ between the two containers.

    Parameters
    ----------
    df_or_table
        The data to filter. Must match ``arrow_mode``.
    predicates
        An iterable of conjunctions, each conjunction being an iterable of
        ``(column, op, value)`` triples. A row is kept when it satisfies all
        triples of at least one conjunction. ``None`` disables filtering.
    strict_date_types
        Forwarded unchanged to ``filter_array_like``.
    arrow_mode
        If ``True``, ``df_or_table`` is accessed through the
        ``.column(name).to_numpy()`` / ``.filter(mask)`` interface; otherwise
        through ``[name].values`` and boolean indexing.

    Returns
    -------
    pd.DataFrame | pa.Table
        ``df_or_table`` itself when ``predicates`` is ``None``, otherwise the
        selection of rows matching the predicates. An empty ``predicates``
        iterable selects no rows.
    """
    if predicates is None:
        return df_or_table

    n_rows = len(df_or_table)
    # Disjunction over conjunctions: start with nothing selected and OR in
    # the rows accepted by each conjunction.
    indexer: npt.NDArray[np.bool_] = np.zeros(n_rows, dtype=bool)
    for conjunction in predicates:
        # Conjunction over literals: start with everything selected and let
        # every literal narrow the selection down.
        inner_indexer: npt.NDArray[np.bool_] = np.ones(n_rows, dtype=bool)
        for column, op, value in conjunction:
            column_name = ensure_unicode_string_type(column)
            if arrow_mode:
                values = df_or_table.column(column_name).to_numpy()
            else:
                values = df_or_table[column_name].values
            # The same array is handed over twice and the return value is
            # not used, so the helper is relied upon to update
            # ``inner_indexer`` in place (presumably as input mask and output
            # buffer -- confirm against ``filter_array_like``).
            filter_array_like(
                values,
                op,
                value,
                inner_indexer,
                inner_indexer,
                strict_date_types=strict_date_types,
                column_name=column_name,
            )
        indexer |= inner_indexer

    if not arrow_mode:
        return df_or_table[indexer]

    table_mask = pa.array(indexer, type=pa.bool_())
    return df_or_table.filter(table_mask)
305+
306+
# The mask is computed on NumPy arrays, so Arrow columns get converted first.
# That conversion may cost some overhead, but it lets the Arrow and pandas
# code paths share a single implementation of the predicate logic.
def filter_table_from_predicates(
    table: pa.Table, predicates: PredicatesType | None
) -> pa.Table:
    """Filter a `pyarrow.Table` based on predicates in disjunctive normal
    form.

    Parameters
    ----------
    table
        The table to filter.
    predicates
        Predicates in disjunctive normal form. ``None`` returns ``table``
        unchanged.

    Returns
    -------
    pa.Table
        The rows of ``table`` matching ``predicates``.

    Notes
    -----
    ``strict_date_types`` is always ``False`` on this code path.

    See Also
    --------
    * :ref:`predicate_pushdown`
    * :ref:`filter_df_from_predicates`
    """
    return _filter_df_or_table_from_predicates(
        df_or_table=table,
        predicates=predicates,
        strict_date_types=False,
        arrow_mode=True,
    )
324+
325+
268326def filter_df_from_predicates (
269327 df : pd .DataFrame ,
270328 predicates : PredicatesType | None ,
@@ -288,24 +346,12 @@ def filter_df_from_predicates(
288346 --------
289347 * :ref:`predicate_pushdown`
290348 """
291- if predicates is None :
292- return df
293- indexer : npt .NDArray [np .bool_ ] = np .zeros (len (df ), dtype = bool )
294- for conjunction in predicates :
295- inner_indexer : npt .NDArray [np .bool_ ] = np .ones (len (df ), dtype = bool )
296- for column , op , value in conjunction :
297- column_name = ensure_unicode_string_type (column )
298- filter_array_like (
299- df [column_name ].values ,
300- op ,
301- value ,
302- inner_indexer ,
303- inner_indexer ,
304- strict_date_types = strict_date_types ,
305- column_name = column_name ,
306- )
307- indexer = inner_indexer | indexer
308- return df [indexer ]
349+ return _filter_df_or_table_from_predicates (
350+ df_or_table = df ,
351+ predicates = predicates ,
352+ strict_date_types = strict_date_types ,
353+ arrow_mode = False ,
354+ )
309355
310356
311357def _handle_categorical_data (array_like , require_ordered ):
0 commit comments