Merged
Changes from 3 commits
69 changes: 59 additions & 10 deletions sdk/python/feast/infra/offline_stores/dask.py
@@ -1,6 +1,6 @@
import os
import uuid
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

@@ -133,21 +133,53 @@ def get_historical_features(
config: RepoConfig,
feature_views: List[FeatureView],
feature_refs: List[str],
entity_df: Union[pd.DataFrame, str],
entity_df: Optional[Union[pd.DataFrame, dd.DataFrame, str]],
registry: BaseRegistry,
project: str,
full_feature_names: bool = False,
**kwargs,
) -> RetrievalJob:
assert isinstance(config.offline_store, DaskOfflineStoreConfig)
for fv in feature_views:
assert isinstance(fv.batch_source, FileSource)

if not isinstance(entity_df, pd.DataFrame) and not isinstance(
entity_df, dd.DataFrame
):
raise ValueError(
f"Please provide an entity_df of type {type(pd.DataFrame)} instead of type {type(entity_df)}"
# Allow non-entity mode using start/end timestamps to enable bounded retrievals without an input entity_df.
# This synthesizes a minimal entity_df solely to drive the existing join and metadata plumbing without
# incurring source scans here; actual pushdowns can be layered in follow-ups if needed.
start_date: Optional[datetime] = kwargs.get("start_date", None)
end_date: Optional[datetime] = kwargs.get("end_date", None)
non_entity_mode = entity_df is None

if non_entity_mode:
# Default end_date to current time (UTC) to keep behavior predictable without extra parameters.
end_date = end_date or datetime.now(timezone.utc)
Contributor:

Suggested change:
- end_date = end_date or datetime.now(timezone.utc)
+ end_date = make_tzaware(end_date) or datetime.now(timezone.utc)

# When start_date is not provided, choose a conservative lower bound using max TTL, otherwise fall back.
if start_date is None:
Contributor:

If start_date is given, you have to make it tz-aware as well?
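
For illustration (not part of the diff): a minimal sketch of the tz handling the two comments above seem to ask for. It assumes feast.utils.make_tzaware is importable; the resolve_window helper name is hypothetical.

from datetime import datetime, timezone
from typing import Optional, Tuple

from feast.utils import make_tzaware  # assumed to be importable from feast.utils


def resolve_window(
    start_date: Optional[datetime], end_date: Optional[datetime]
) -> Tuple[Optional[datetime], datetime]:
    # Default end_date to "now" in UTC; otherwise force it to be tz-aware.
    end_date = make_tzaware(end_date) if end_date else datetime.now(timezone.utc)
    # Per the question above, a user-supplied start_date is made tz-aware too,
    # so later comparisons against tz-aware event timestamps don't raise.
    start_date = make_tzaware(start_date) if start_date else None
    return start_date, end_date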

max_ttl_seconds = 0
for fv in feature_views:
if fv.ttl and isinstance(fv.ttl, timedelta):
max_ttl_seconds = max(
max_ttl_seconds, int(fv.ttl.total_seconds())
)
if max_ttl_seconds > 0:
start_date = end_date - timedelta(seconds=max_ttl_seconds)
else:
# Keep default window bounded to avoid unbounded scans by default.
start_date = end_date - timedelta(days=30)

Comment on lines 159 to 173

Copilot AI (Nov 12, 2025):

The start_date parameter is not enforced in the actual data filtering - only the end_date is used (along with TTL). The filtering logic in _filter_ttl will use end_date - ttl as the lower bound, not the user-provided start_date. This means if a user provides a start_date that is later than end_date - ttl, they may get more data than expected.

Consider adding a validation check or warning when start_date is provided but will be overridden by TTL logic, or add a TODO comment indicating that proper start_date filtering should be implemented in a follow-up.

Suggested change (replacing the block flagged above):
# Compute TTL-based lower bound for start_date.
max_ttl_seconds = 0
for fv in feature_views:
    if fv.ttl and isinstance(fv.ttl, timedelta):
        max_ttl_seconds = max(
            max_ttl_seconds, int(fv.ttl.total_seconds())
        )
if max_ttl_seconds > 0:
    ttl_lower_bound = end_date - timedelta(seconds=max_ttl_seconds)
else:
    # Keep default window bounded to avoid unbounded scans by default.
    ttl_lower_bound = end_date - timedelta(days=30)
# If user provided start_date, use the max of user start_date and ttl_lower_bound.
if start_date is not None:
    if start_date < ttl_lower_bound:
        import warnings
        warnings.warn(
            f"Provided start_date ({start_date}) is earlier than TTL-based lower bound ({ttl_lower_bound}). Overriding start_date to {ttl_lower_bound}."
        )
    start_date = max(start_date, ttl_lower_bound)
else:
    start_date = ttl_lower_bound

# Minimal synthetic entity_df: one timestamp row; join keys are not materialized here on purpose to avoid
# accidental dependence on specific feature view schemas at this layer.
entity_df = pd.DataFrame(
{DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL: [end_date]}
Contributor:

You haven't given the start date and tz. Both matter very much. It should be:

start=start_date, end=end_date, freq="1s", tz=timezone.utc
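
For illustration (not part of the diff): a sketch of what the suggestion above might look like for the synthetic entity_df. The column name "event_timestamp" is assumed to match DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, and whether a dense 1-second grid is the right trade-off is left to the discussion.

from datetime import datetime, timezone

import pandas as pd

# Assumed to match feast's DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL.
EVENT_TIMESTAMP_COL = "event_timestamp"

start_date = datetime(2023, 1, 1, tzinfo=timezone.utc)
end_date = datetime(2023, 1, 2, tzinfo=timezone.utc)

# Span the whole [start_date, end_date] window instead of a single end_date row.
entity_df = pd.DataFrame(
    {
        EVENT_TIMESTAMP_COL: pd.date_range(
            start=start_date, end=end_date, freq="1s", tz=timezone.utc
        )
    }
)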

)
else:
if not isinstance(entity_df, pd.DataFrame) and not isinstance(
entity_df, dd.DataFrame
):
raise ValueError(
f"Please provide an entity_df of type {type(pd.DataFrame)} or dask.dataframe instead of type {type(entity_df)}"
Copilot AI (Nov 12, 2025):

The error message formatting is incorrect. The f-string should format pd.DataFrame as a string, not call type() on it. This will result in an error message like "...type <class 'type'> instead of type <class 'type'>".

Consider changing to:

f"Please provide an entity_df of type pd.DataFrame or dask.dataframe.DataFrame instead of type {type(entity_df)}"

Suggested change:
- f"Please provide an entity_df of type {type(pd.DataFrame)} or dask.dataframe instead of type {type(entity_df)}"
+ f"Please provide an entity_df of type pd.DataFrame or dask.dataframe.DataFrame instead of type {type(entity_df)}"
Contributor:

@aniketpalu Please fix this as well.

)
entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL # local modifiable copy of global variable
if entity_df_event_timestamp_col not in entity_df.columns:
datetime_columns = entity_df.select_dtypes(
@@ -171,8 +203,12 @@ def get_historical_features(
registry.list_on_demand_feature_views(config.project),
)

entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
entity_df, entity_df_event_timestamp_col
entity_df_event_timestamp_range = (
(start_date, end_date)
if non_entity_mode
else _get_entity_df_event_timestamp_range(
entity_df, entity_df_event_timestamp_col
)
)

# Create lazy function that is only called from the RetrievalJob object
@@ -260,7 +296,20 @@ def evaluate_historical_retrieval():
full_feature_names,
)

df_to_join = _merge(entity_df_with_features, df_to_join, join_keys)
# df_to_join = _merge(entity_df_with_features, df_to_join, join_keys)

Copilot AI (Nov 12, 2025):

Commented-out code should be removed. If this line is no longer needed due to the refactoring below, it should be deleted rather than commented out.

Suggested change (delete the line):
- # df_to_join = _merge(entity_df_with_features, df_to_join, join_keys)
# In non-entity mode, if the synthetic entity_df lacks join keys, cross join to build a snapshot
# of all entities as-of the requested timestamp, then rely on TTL and deduplication to select
# the appropriate latest rows per entity.
current_join_keys = join_keys
if non_entity_mode and any(
k not in entity_df_with_features.columns for k in join_keys
):
Copilot AI (Nov 12, 2025):

[nitpick] The logic for checking missing join keys could be simplified. In non-entity mode, the synthetic entity_df will never contain join keys (it only has the event_timestamp column). Therefore, the any() check is unnecessary overhead.

Consider simplifying to:

current_join_keys = join_keys
if non_entity_mode:
    current_join_keys = []

This makes the logic clearer and more efficient since we know join keys are never present in the synthetic entity_df.

Suggested change:
- if non_entity_mode and any(
-     k not in entity_df_with_features.columns for k in join_keys
- ):
+ if non_entity_mode:
Contributor:

Agree with Copilot here.

current_join_keys = []

df_to_join = _merge(
entity_df_with_features, df_to_join, current_join_keys
)
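
For illustration (not part of the diff): what the cross join described in the comment above produces, shown with plain pandas. Whether feast's internal _merge helper behaves exactly like this when given an empty join-key list is an assumption of this sketch.

import pandas as pd

# One synthetic timestamp row (non-entity mode)...
entity_df_with_features = pd.DataFrame(
    {"event_timestamp": [pd.Timestamp("2023-01-02", tz="UTC")]}
)
# ...paired with every entity row read from the source.
df_to_join = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "event_timestamp": [
            pd.Timestamp("2023-01-01 10:00", tz="UTC"),
            pd.Timestamp("2023-01-01 11:00", tz="UTC"),
        ],
        "conv_rate": [0.5, 0.7],
    }
)

# With no join keys, every entity row is paired with the synthetic timestamp;
# TTL filtering and deduplication then keep the latest row per entity.
snapshot = entity_df_with_features.merge(df_to_join, how="cross")
print(snapshot)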

df_to_join = _normalize_timestamp(
df_to_join, timestamp_field, created_timestamp_column
@@ -0,0 +1,70 @@
from datetime import datetime, timezone
from unittest.mock import MagicMock

from feast.entity import Entity
from feast.feature_view import FeatureView, Field
from feast.infra.offline_stores.dask import (
DaskOfflineStore,
DaskOfflineStoreConfig,
)
from feast.infra.offline_stores.file_source import FileSource
from feast.repo_config import RepoConfig
from feast.types import Float32, ValueType


def _mock_dask_offline_store_config():
return DaskOfflineStoreConfig(type="dask")


def _mock_entity():
return [
Entity(
name="driver_id",
join_keys=["driver_id"],
description="Driver ID",
value_type=ValueType.INT64,
)
]


def _mock_feature_view():
return FeatureView(
name="driver_stats",
entities=_mock_entity(),
schema=[
Field(name="conv_rate", dtype=Float32),
],
source=FileSource(
path="dummy.parquet", # not read in this test
timestamp_field="event_timestamp",
),
)


def test_dask_non_entity_historical_retrieval_accepts_dates():
repo_config = RepoConfig(
project="test_project",
registry="test_registry",
provider="local",
offline_store=_mock_dask_offline_store_config(),
)

fv = _mock_feature_view()

# Expect this to work once non-entity mode is implemented for Dask-based store
Copilot AI (Nov 12, 2025):

The comment is outdated. Since the non-entity mode is now implemented (not just expected to work in the future), this comment should be updated to describe what the test actually validates.

Consider updating to something like:

# Verify that non-entity mode (entity_df=None) accepts start_date and end_date parameters

Suggested change:
- # Expect this to work once non-entity mode is implemented for Dask-based store
+ # Verify that non-entity mode (entity_df=None) accepts start_date and end_date parameters
retrieval_job = DaskOfflineStore.get_historical_features(
config=repo_config,
feature_views=[fv],
feature_refs=["driver_stats:conv_rate"],
entity_df=None, # start/end-only mode
registry=MagicMock(),
project="test_project",
full_feature_names=False,
start_date=datetime(2023, 1, 1, tzinfo=timezone.utc),
end_date=datetime(2023, 1, 2, tzinfo=timezone.utc),
)

# When implemented, should return a RetrievalJob
Copilot AI (Nov 12, 2025):

The comment is outdated. Since the implementation is complete, this comment should be updated to describe what is being asserted.

Consider updating to something like:

# Should return a RetrievalJob instance

Suggested change:
- # When implemented, should return a RetrievalJob
+ # Should return a RetrievalJob instance
from feast.infra.offline_stores.offline_store import RetrievalJob

assert isinstance(retrieval_job, RetrievalJob)
Contributor:

I don't think this is good enough to validate the data-based retrieval.
