Skip to content

Commit cad2f48

Browse files
committed
Tracing node execution in cudf-polars
This PR introduces a new *low-overhead* tracing tool for cudf-polars. When enabled, we'll capture a record for each `IR.do_evaluate` node executed while running the polars query.
1 parent 9f2fe17 commit cad2f48

File tree

6 files changed

+372
-5
lines changed

6 files changed

+372
-5
lines changed

python/cudf_polars/cudf_polars/callback.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import pylibcudf
2222
import rmm
23+
import rmm.statistics
2324
from rmm._cuda import gpu
2425

2526
from cudf_polars.dsl.tracing import CUDF_POLARS_NVTX_DOMAIN
@@ -143,6 +144,7 @@ def set_memory_resource(
143144
),
144145
)
145146
rmm.mr.set_current_device_resource(mr)
147+
rmm.statistics.enable_statistics()
146148
try:
147149
yield mr
148150
finally:

python/cudf_polars/cudf_polars/dsl/ir.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from cudf_polars.dsl.expressions.base import ExecutionContext
3434
from cudf_polars.dsl.nodebase import Node
3535
from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter
36-
from cudf_polars.dsl.tracing import nvtx_annotate_cudf_polars
36+
from cudf_polars.dsl.tracing import log_do_evaluate, nvtx_annotate_cudf_polars
3737
from cudf_polars.dsl.utils.reshape import broadcast
3838
from cudf_polars.dsl.utils.windows import range_window_bounds
3939
from cudf_polars.utils import dtypes
@@ -439,6 +439,7 @@ def fast_count(self) -> int: # pragma: no cover
439439

440440
@classmethod
441441
@nvtx_annotate_cudf_polars(message="Scan")
442+
@log_do_evaluate
442443
def do_evaluate(
443444
cls,
444445
schema: Schema,
@@ -929,6 +930,7 @@ def _write_parquet(
929930

930931
@classmethod
931932
@nvtx_annotate_cudf_polars(message="Sink")
933+
@log_do_evaluate
932934
def do_evaluate(
933935
cls,
934936
schema: Schema,
@@ -988,6 +990,7 @@ def is_equal(self, other: Self) -> bool: # noqa: D102
988990

989991
@classmethod
990992
@nvtx_annotate_cudf_polars(message="Cache")
993+
@log_do_evaluate
991994
def do_evaluate(
992995
cls, key: int, refcount: int | None, df: DataFrame
993996
) -> DataFrame: # pragma: no cover; basic evaluation never calls this
@@ -1068,6 +1071,7 @@ def get_hashable(self) -> Hashable:
10681071

10691072
@classmethod
10701073
@nvtx_annotate_cudf_polars(message="DataFrameScan")
1074+
@log_do_evaluate
10711075
def do_evaluate(
10721076
cls,
10731077
schema: Schema,
@@ -1127,6 +1131,7 @@ def _is_len_expr(exprs: tuple[expr.NamedExpr, ...]) -> bool: # pragma: no cover
11271131

11281132
@classmethod
11291133
@nvtx_annotate_cudf_polars(message="Select")
1134+
@log_do_evaluate
11301135
def do_evaluate(
11311136
cls,
11321137
exprs: tuple[expr.NamedExpr, ...],
@@ -1210,6 +1215,7 @@ def __init__(
12101215

12111216
@classmethod
12121217
@nvtx_annotate_cudf_polars(message="Reduce")
1218+
@log_do_evaluate
12131219
def do_evaluate(
12141220
cls,
12151221
exprs: tuple[expr.NamedExpr, ...],
@@ -1306,6 +1312,7 @@ def __init__(
13061312

13071313
@classmethod
13081314
@nvtx_annotate_cudf_polars(message="Rolling")
1315+
@log_do_evaluate
13091316
def do_evaluate(
13101317
cls,
13111318
index: expr.NamedExpr,
@@ -1430,6 +1437,7 @@ def __init__(
14301437

14311438
@classmethod
14321439
@nvtx_annotate_cudf_polars(message="GroupBy")
1440+
@log_do_evaluate
14331441
def do_evaluate(
14341442
cls,
14351443
schema: Schema,
@@ -1594,6 +1602,7 @@ def __init__(
15941602

15951603
@classmethod
15961604
@nvtx_annotate_cudf_polars(message="ConditionalJoin")
1605+
@log_do_evaluate
15971606
def do_evaluate(
15981607
cls,
15991608
predicate_wrapper: Predicate,
@@ -1805,6 +1814,7 @@ def _build_columns(
18051814

18061815
@classmethod
18071816
@nvtx_annotate_cudf_polars(message="Join")
1817+
@log_do_evaluate
18081818
def do_evaluate(
18091819
cls,
18101820
left_on_exprs: Sequence[expr.NamedExpr],
@@ -1950,6 +1960,7 @@ def __init__(
19501960

19511961
@classmethod
19521962
@nvtx_annotate_cudf_polars(message="HStack")
1963+
@log_do_evaluate
19531964
def do_evaluate(
19541965
cls,
19551966
exprs: Sequence[expr.NamedExpr],
@@ -2015,6 +2026,7 @@ def __init__(
20152026

20162027
@classmethod
20172028
@nvtx_annotate_cudf_polars(message="Distinct")
2029+
@log_do_evaluate
20182030
def do_evaluate(
20192031
cls,
20202032
keep: plc.stream_compaction.DuplicateKeepOption,
@@ -2105,6 +2117,7 @@ def __init__(
21052117

21062118
@classmethod
21072119
@nvtx_annotate_cudf_polars(message="Sort")
2120+
@log_do_evaluate
21082121
def do_evaluate(
21092122
cls,
21102123
by: Sequence[expr.NamedExpr],
@@ -2155,6 +2168,7 @@ def __init__(self, schema: Schema, offset: int, length: int | None, df: IR):
21552168

21562169
@classmethod
21572170
@nvtx_annotate_cudf_polars(message="Slice")
2171+
@log_do_evaluate
21582172
def do_evaluate(cls, offset: int, length: int, df: DataFrame) -> DataFrame:
21592173
"""Evaluate and return a dataframe."""
21602174
return df.slice((offset, length))
@@ -2176,6 +2190,7 @@ def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR):
21762190

21772191
@classmethod
21782192
@nvtx_annotate_cudf_polars(message="Filter")
2193+
@log_do_evaluate
21792194
def do_evaluate(cls, mask_expr: expr.NamedExpr, df: DataFrame) -> DataFrame:
21802195
"""Evaluate and return a dataframe."""
21812196
(mask,) = broadcast(mask_expr.evaluate(df), target_length=df.num_rows)
@@ -2195,6 +2210,7 @@ def __init__(self, schema: Schema, df: IR):
21952210

21962211
@classmethod
21972212
@nvtx_annotate_cudf_polars(message="Projection")
2213+
@log_do_evaluate
21982214
def do_evaluate(cls, schema: Schema, df: DataFrame) -> DataFrame:
21992215
"""Evaluate and return a dataframe."""
22002216
# This can reorder things.
@@ -2224,6 +2240,7 @@ def __init__(self, schema: Schema, key: str, left: IR, right: IR):
22242240

22252241
@classmethod
22262242
@nvtx_annotate_cudf_polars(message="MergeSorted")
2243+
@log_do_evaluate
22272244
def do_evaluate(cls, key: str, *dfs: DataFrame) -> DataFrame:
22282245
"""Evaluate and return a dataframe."""
22292246
left, right = dfs
@@ -2344,6 +2361,7 @@ def get_hashable(self) -> Hashable:
23442361

23452362
@classmethod
23462363
@nvtx_annotate_cudf_polars(message="MapFunction")
2364+
@log_do_evaluate
23472365
def do_evaluate(
23482366
cls, schema: Schema, name: str, options: Any, df: DataFrame
23492367
) -> DataFrame:
@@ -2444,6 +2462,7 @@ def __init__(self, schema: Schema, zlice: Zlice | None, *children: IR):
24442462

24452463
@classmethod
24462464
@nvtx_annotate_cudf_polars(message="Union")
2465+
@log_do_evaluate
24472466
def do_evaluate(cls, zlice: Zlice | None, *dfs: DataFrame) -> DataFrame:
24482467
"""Evaluate and return a dataframe."""
24492468
# TODO: only evaluate what we need if we have a slice?
@@ -2501,6 +2520,7 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
25012520

25022521
@classmethod
25032522
@nvtx_annotate_cudf_polars(message="HConcat")
2523+
@log_do_evaluate
25042524
def do_evaluate(
25052525
cls,
25062526
should_broadcast: bool, # noqa: FBT001
@@ -2546,6 +2566,7 @@ def __init__(self, schema: Schema):
25462566

25472567
@classmethod
25482568
@nvtx_annotate_cudf_polars(message="Empty")
2569+
@log_do_evaluate
25492570
def do_evaluate(cls, schema: Schema) -> DataFrame: # pragma: no cover
25502571
"""Evaluate and return a dataframe."""
25512572
return DataFrame(

python/cudf_polars/cudf_polars/dsl/tracing.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,145 @@
66
from __future__ import annotations
77

88
import functools
9+
import os
10+
import time
11+
from typing import TYPE_CHECKING, Any, Literal
912

1013
import nvtx
14+
from typing_extensions import ParamSpec
15+
16+
import rmm
17+
import rmm.statistics
18+
19+
import cudf_polars.containers
20+
21+
# structlog is an optional dependency: tracing silently disables itself
# when it is not installed.
try:
    import structlog
except ImportError:
    HAS_STRUCTLOG = False
else:
    HAS_STRUCTLOG = True

# Environment-variable values treated as "enabled".
_TRUTHY = frozenset({"1", "true", "y", "yes"})

# Question: should this be toggleable at runtime?
LOG_TRACES = HAS_STRUCTLOG and os.environ.get("CUDF_POLARS_LOG_TRACES", "0") in _TRUTHY

# NVTX domain under which all cudf-polars ranges are reported.
CUDF_POLARS_NVTX_DOMAIN = "cudf_polars"

# Convenience wrapper so call sites only supply ``message=...``.
nvtx_annotate_cudf_polars = functools.partial(
    nvtx.annotate, domain=CUDF_POLARS_NVTX_DOMAIN
)

if TYPE_CHECKING:
    from collections.abc import Callable

    from cudf_polars.dsl import ir
48+
def make_snaphot(
    node_type: type[ir.IR],
    frames: list[cudf_polars.containers.DataFrame],
    extra: dict[str, Any] | None = None,
    phase: Literal["input", "output"] = "input",
) -> dict[str, Any]:
    """
    Build a snapshot record for one side of an ``IR.do_evaluate`` call.

    This does not log anything itself; it returns a flat dict that the
    caller (``log_do_evaluate``) merges and emits. Keys other than
    ``"type"`` are suffixed with ``phase`` so the input and output
    snapshots of one evaluation can be merged without collisions.

    Parameters
    ----------
    node_type
        The type of the IR node.
    frames
        The frames being evaluated (``phase="input"``) or produced
        (``phase="output"``).
    extra
        Extra key/value pairs merged into the record, e.g. timing data.
    phase
        The phase of the evaluation. Either "input" or "output".

    Returns
    -------
    dict
        Record with per-frame shape/size, total byte count, and — when
        RMM statistics tracking is enabled — the current/peak/total RMM
        allocation counters.
    """
    # Build the per-frame info first so the byte total can be computed
    # directly, without reaching back into the record dict.
    frame_infos = [
        {
            "shape": frame.table.shape(),
            # Total device memory held by the frame's columns.
            "size": sum(col.device_buffer_size() for col in frame.table.columns()),
        }
        for frame in frames
    ]
    record: dict[str, Any] = {
        "type": node_type.__name__,
        f"count_frames_{phase}": len(frame_infos),
        f"frames_{phase}": frame_infos,
        f"total_bytes_{phase}": sum(info["size"] for info in frame_infos),
    }

    # get_statistics() returns None unless rmm.statistics.enable_statistics()
    # was called (see callback.set_memory_resource).
    stats = rmm.statistics.get_statistics()
    if stats:
        record.update(
            {
                f"rmm_current_bytes_{phase}": stats.current_bytes,
                f"rmm_current_count_{phase}": stats.current_count,
                f"rmm_peak_bytes_{phase}": stats.peak_bytes,
                f"rmm_peak_count_{phase}": stats.peak_count,
                f"rmm_total_bytes_{phase}": stats.total_bytes,
                f"rmm_total_count_{phase}": stats.total_count,
            }
        )

    if extra:
        record.update(extra)

    return record


# Correctly spelled alias; ``make_snaphot`` (sic) is retained above so
# existing callers keep working.
make_snapshot = make_snaphot
101+
102+
103+
P = ParamSpec("P")


def log_do_evaluate(
    func: Callable[P, cudf_polars.containers.DataFrame],
) -> Callable[P, cudf_polars.containers.DataFrame]:
    """
    Decorator for an ``IR.do_evaluate`` method that logs information before and after evaluation.

    Parameters
    ----------
    func
        The ``IR.do_evaluate`` method to wrap.
    """

    @functools.wraps(func)
    def wrapper(
        cls: type[ir.IR], *args: Any, **kwargs: Any
    ) -> cudf_polars.containers.DataFrame:
        # Fast path: tracing disabled, just delegate.
        if not LOG_TRACES:
            return func(cls, *args, **kwargs)  # type: ignore

        logger = structlog.get_logger()
        # Every DataFrame among the positional/keyword arguments counts
        # as an input frame for the snapshot.
        input_frames = [
            value
            for value in (*args, *kwargs.values())
            if isinstance(value, cudf_polars.containers.DataFrame)
        ]

        before = make_snaphot(cls, input_frames, phase="input")

        # TODO: fix these types! Want some way to say
        # Callable[ir.IR, *P.args, **P.kwargs], cudf_polars.containers.DataFrame]
        # i.e. the first arg is an IR, it returns a DataFrame, and does
        # whatever for the remaining args/kwargs.
        start = time.monotonic_ns()
        result = func(cls, *args, **kwargs)  # type: ignore
        stop = time.monotonic_ns()

        after = make_snaphot(
            cls, [result], phase="output", extra={"start": start, "stop": stop}
        )
        logger.info("Execute IR", **(before | after))

        return result

    return wrapper  # type: ignore

python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def run_duckdb(benchmark: Any, options: Sequence[str] | None = None) -> None:
124124
result = execute_duckdb_query(duckdb_query, run_config.dataset_path)
125125

126126
t1 = time.time()
127-
record = Record(query=q_id, duration=t1 - t0)
127+
record = Record(query=q_id, iteration=i, duration=t1 - t0)
128128
if args.print_results:
129129
print(result)
130130

0 commit comments

Comments
 (0)