Add names to dataframe (#235)

jm-rivera · web-flow · commit d6b21c2198c8 · 2025-04-08T22:04:52.000+02:00
Improve the Observations DataFrame to include `entity` and `variable` names.

---
- Adds `add_entity_names_to_observations_dataframe` which inserts
resolved names for specific columns in a dataframe
- Adds `flatten_names_dictionary` which converts a dictionary of `Name`
objects into a flat dcid->name dict.
- Modifies `.observations_dataframe` in order to insert names for
`entity` and `variable` for a more user-friendly DataFrame
diff --git a/datacommons_client/client.py b/datacommons_client/client.py
@@ -5,6 +5,7 @@
 from datacommons_client.endpoints.observation import ObservationEndpoint
 from datacommons_client.endpoints.payloads import ObservationDate
 from datacommons_client.endpoints.resolve import ResolveEndpoint
+from datacommons_client.utils.dataframes import add_entity_names_to_observations_dataframe
 from datacommons_client.utils.decorators import requires_pandas
 from datacommons_client.utils.error_handling import NoDataForPropertyError
 
@@ -131,7 +132,7 @@ def observations_dataframe(
         date (ObservationDate | str): The date for which observations are requested. It can be
             a specific date, "all" to retrieve all observations, or "latest" to get the most recent observations.
         entity_dcids (Literal["all"] | list[str], optional): The entity DCIDs for which to retrieve data.
-            Defaults to "all". 
+            Defaults to "all".
         entity_type (Optional[str]): The type of entities to filter by when `entity_dcids="all"`.
             Required if `entity_dcids="all"`. Defaults to None.
         parent_entity (Optional[str]): The parent entity under which the target entities fall.
@@ -181,4 +182,14 @@ def observations_dataframe(
           variable_dcids=variable_dcids,
           filter_facet_ids=facets)
 
-    return pd.DataFrame(observations.to_observation_records())
+    # Convert the observations to a DataFrame
+    df = pd.DataFrame(observations.to_observation_records())
+
+    # Add entity names to the DataFrame
+    df = add_entity_names_to_observations_dataframe(
+        endpoint=self.node,
+        observations_df=df,
+        entity_columns=["entity", "variable"],
+    )
+
+    return df
diff --git a/datacommons_client/tests/test_client.py b/datacommons_client/tests/test_client.py
@@ -4,12 +4,12 @@
 import pandas as pd
 import pytest
 
-import datacommons_client.client
 from datacommons_client.client import DataCommonsClient
 from datacommons_client.endpoints.base import API
 from datacommons_client.endpoints.node import NodeEndpoint
 from datacommons_client.endpoints.observation import ObservationEndpoint
 from datacommons_client.endpoints.resolve import ResolveEndpoint
+from datacommons_client.models.node import Name
 from datacommons_client.utils.error_handling import NoDataForPropertyError
 
 
@@ -133,7 +133,8 @@ def test_observations_dataframe_calls_fetch_observations_by_entity(mock_client):
       date="latest",
       entity_dcids=["entity1", "entity2"],
       variable_dcids="var1",
-      filter_facet_ids=None)
+      filter_facet_ids=None,
+  )
 
   assert isinstance(df, pd.DataFrame)
   assert df.empty
@@ -159,17 +160,34 @@ def test_observations_dataframe_returns_dataframe_with_expected_columns(
       },
   ]
 
+  # Mock entity name lookup to prevent API calls
+  mock_client.node.fetch_entity_names = MagicMock(return_value={
+      "entity1": Name(value="Entity One", language="en", property="name"),
+      "entity2": Name(value="Entity Two", language="en", property="name"),
+      "var1": Name(value="Variable One", language="en", property="name"),
+      "var2": Name(value="Variable Two", language="en", property="name"),
+  },)
+
   df = mock_client.observations_dataframe(variable_dcids="var1",
                                           date="2024",
                                           entity_dcids=["entity1", "entity2"])
 
   assert isinstance(df, pd.DataFrame)
-  assert set(df.columns) == {"date", "entity", "variable", "value", "unit"}
+  assert set(df.columns) == {
+      "date", "entity", "entity_name", "variable", "variable_name", "value",
+      "unit"
+  }
   assert len(df) == 2
+  assert df.iloc[0]["entity"] == "entity1"
+  assert df.iloc[0]["entity_name"] == "Entity One"
+  assert df.iloc[1]["entity"] == "entity2"
+  assert df.iloc[1]["entity_name"] == "Entity Two"
   assert df.iloc[0]["variable"] == "var1"
+  assert df.iloc[0]["variable_name"] == "Variable One"
   assert df.iloc[0]["value"] == 100
   assert df.iloc[0]["unit"] == "unit1"
   assert df.iloc[1]["variable"] == "var2"
+  assert df.iloc[1]["variable_name"] == "Variable Two"
   assert df.iloc[1]["value"] == 200
   assert df.iloc[1]["unit"] == "unit2"
 
@@ -219,7 +237,9 @@ def test_observations_dataframe_filters_by_facet_ids(mock_client):
   """Tests that observations_dataframe includes facet filtering when property_filters are used."""
   mock_client._find_filter_facet_ids = MagicMock(
       return_value=["facet_1", "facet_2"])
-  mock_client.observation.fetch_observations_by_entity_dcid.return_value.to_observation_records.return_value = []
+
+  mock_client.observation.fetch_observations_by_entity_dcid.return_value.to_observation_records.return_value = (
+      [])
 
   df = mock_client.observations_dataframe(
       variable_dcids="var1",
@@ -232,7 +252,8 @@ def test_observations_dataframe_filters_by_facet_ids(mock_client):
       variable_dcids="var1",
       date="2024",
       entity_dcids=["entity1"],
-      filter_facet_ids=["facet_1", "facet_2"])
+      filter_facet_ids=["facet_1", "facet_2"],
+  )
   assert isinstance(df, pd.DataFrame)
 
 
diff --git a/datacommons_client/utils/data_processing.py b/datacommons_client/utils/data_processing.py
@@ -2,6 +2,8 @@
 import json
 from typing import Any, Dict
 
+from datacommons_client.models.node import Name
+
 
 def unpack_arcs(arcs: Dict[str, Any]) -> Any:
   """Simplify the 'arcs' structure."""
@@ -99,16 +101,16 @@ def observations_as_records(data: dict, facets: dict) -> list[dict]:
 def group_variables_by_entity(
     data: dict[str, list[str]]) -> dict[str, list[str]]:
   """Groups variables by the entities they are associated with.
-      Takes a dictionary mapping statistical variable DCIDs to a list of entity DCIDs,
-      and returns a new dictionary mapping each entity DCID to a list of statistical
-      variables available for that entity.
-      Args:
-          data: A dictionary where each key is a variable DCID and the value is a list
-              of entity DCIDs that have observations for that variable.
-      Returns:
-          A dictionary where each key is an entity DCID and the value is a list of
-          variable DCIDs available for that entity.
-      """
+    Takes a dictionary mapping statistical variable DCIDs to a list of entity DCIDs,
+    and returns a new dictionary mapping each entity DCID to a list of statistical
+    variables available for that entity.
+    Args:
+        data: A dictionary where each key is a variable DCID and the value is a list
+            of entity DCIDs that have observations for that variable.
+    Returns:
+        A dictionary where each key is an entity DCID and the value is a list of
+        variable DCIDs available for that entity.
+    """
   result: dict[str, list[str]] = {}
   for variable, entities in data.items():
     for entity in entities:
@@ -150,3 +152,18 @@ def to_json(self, exclude_none: bool = True) -> str:
             str: The JSON string representation of the instance.
         """
     return json.dumps(self.to_dict(exclude_none=exclude_none), indent=2)
+
+
+def flatten_names_dictionary(names_dict: dict[str, Name]) -> dict[str, str]:
+  """Converts a mapping of DCIDs to Name objects into a DCID-to-name mapping.
+
+    Args:
+        names_dict (dict[str, Name]): Name objects keyed by DCID.
+
+    Returns:
+        dict[str, str]: The 'value' entry of each Name's dict form, keyed by DCID.
+    """
+  flattened: dict[str, str] = {}
+  for dcid, name in names_dict.items():
+    flattened[dcid] = name.to_dict()['value']
+  return flattened
diff --git a/datacommons_client/utils/dataframes.py b/datacommons_client/utils/dataframes.py
@@ -0,0 +1,61 @@
+from datacommons_client.endpoints.node import NodeEndpoint
+from datacommons_client.utils.data_processing import flatten_names_dictionary
+
+try:
+  import pandas as pd
+except ImportError:
+  pd = None
+
+from datacommons_client.utils.decorators import requires_pandas
+
+
+@requires_pandas
+def add_entity_names_to_observations_dataframe(
+    endpoint: NodeEndpoint,
+    observations_df: "pd.DataFrame",
+    entity_columns: str | list[str],
+) -> "pd.DataFrame":
+  """
+    Adds entity names to the observations DataFrame (modified in place).
+
+    Args:
+        endpoint (NodeEndpoint): The NodeEndpoint instance for fetching entity names.
+        observations_df (pd.DataFrame): The DataFrame containing observations.
+        entity_columns (str | list[str]): The column(s) containing entity DCIDs.
+    Returns:
+        pd.DataFrame: The input DataFrame with `<column>_name` columns inserted (unchanged if empty).
+    Raises:
+        ValueError: If a requested column is not present in the DataFrame.
+    """
+  # The pandas annotations above are quoted on purpose: `pd` is None when pandas
+  # is not installed, and unquoted `pd.DataFrame` would fail at import time.
+  if observations_df.empty:
+    return observations_df
+
+  if not isinstance(entity_columns, list):
+    entity_columns = [entity_columns]
+
+  for entity_column in entity_columns:
+    if entity_column not in observations_df.columns:
+      raise ValueError(
+          "The specified entity column does not exist in the DataFrame.")
+
+    # Unique, non-null DCIDs; skip the lookup when the column has none
+    unique_values = observations_df[entity_column].dropna().unique().tolist()
+    if not unique_values:
+      continue
+
+    # Fetch the names and flatten them into a dcid -> name dictionary
+    response = endpoint.fetch_entity_names(entity_dcids=unique_values)
+    names = flatten_names_dictionary(response)
+
+    # Insert the names next to the entity column; an existing name column is kept
+    name_column = f"{entity_column}_name"
+    if name_column not in observations_df.columns:
+      observations_df.insert(
+          loc=observations_df.columns.get_loc(entity_column) + 1,
+          column=name_column,
+          value=observations_df[entity_column].map(names),
+      )
+
+  return observations_df