Skip to content

Commit d6b21c2

Browse files
authored
Add names to dataframe (#235)
Improve the Observations DataFrame to include `entity` and `variable` names. --- - Adds `add_entity_names_to_observations_dataframe` which inserts resolved names for specific columns in a dataframe - Adds `flatten_names_dictionary` which converts a dictionary of `Name` objects into a flat dcid->name dict. - Modifies `.observations_dataframe` in order to insert names for `entity` and `variable` for a more user-friendly DataFrame
1 parent 0ebdf0b commit d6b21c2

File tree

4 files changed

+127
-17
lines changed

4 files changed

+127
-17
lines changed

datacommons_client/client.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from datacommons_client.endpoints.observation import ObservationEndpoint
66
from datacommons_client.endpoints.payloads import ObservationDate
77
from datacommons_client.endpoints.resolve import ResolveEndpoint
8+
from datacommons_client.utils.dataframes import add_entity_names_to_observations_dataframe
89
from datacommons_client.utils.decorators import requires_pandas
910
from datacommons_client.utils.error_handling import NoDataForPropertyError
1011

@@ -131,7 +132,7 @@ def observations_dataframe(
131132
date (ObservationDate | str): The date for which observations are requested. It can be
132133
a specific date, "all" to retrieve all observations, or "latest" to get the most recent observations.
133134
entity_dcids (Literal["all"] | list[str], optional): The entity DCIDs for which to retrieve data.
134-
Defaults to "all".
135+
Defaults to "all".
135136
entity_type (Optional[str]): The type of entities to filter by when `entity_dcids="all"`.
136137
Required if `entity_dcids="all"`. Defaults to None.
137138
parent_entity (Optional[str]): The parent entity under which the target entities fall.
@@ -181,4 +182,14 @@ def observations_dataframe(
181182
variable_dcids=variable_dcids,
182183
filter_facet_ids=facets)
183184

184-
return pd.DataFrame(observations.to_observation_records())
185+
# Convert the observations to a DataFrame
186+
df = pd.DataFrame(observations.to_observation_records())
187+
188+
# Add entity names to the DataFrame
189+
df = add_entity_names_to_observations_dataframe(
190+
endpoint=self.node,
191+
observations_df=df,
192+
entity_columns=["entity", "variable"],
193+
)
194+
195+
return df

datacommons_client/tests/test_client.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
import pandas as pd
55
import pytest
66

7-
import datacommons_client.client
87
from datacommons_client.client import DataCommonsClient
98
from datacommons_client.endpoints.base import API
109
from datacommons_client.endpoints.node import NodeEndpoint
1110
from datacommons_client.endpoints.observation import ObservationEndpoint
1211
from datacommons_client.endpoints.resolve import ResolveEndpoint
12+
from datacommons_client.models.node import Name
1313
from datacommons_client.utils.error_handling import NoDataForPropertyError
1414

1515

@@ -133,7 +133,8 @@ def test_observations_dataframe_calls_fetch_observations_by_entity(mock_client):
133133
date="latest",
134134
entity_dcids=["entity1", "entity2"],
135135
variable_dcids="var1",
136-
filter_facet_ids=None)
136+
filter_facet_ids=None,
137+
)
137138

138139
assert isinstance(df, pd.DataFrame)
139140
assert df.empty
@@ -159,17 +160,34 @@ def test_observations_dataframe_returns_dataframe_with_expected_columns(
159160
},
160161
]
161162

163+
# Mock entity name lookup to prevent API calls
164+
mock_client.node.fetch_entity_names = MagicMock(return_value={
165+
"entity1": Name(value="Entity One", language="en", property="name"),
166+
"entity2": Name(value="Entity Two", language="en", property="name"),
167+
"var1": Name(value="Variable One", language="en", property="name"),
168+
"var2": Name(value="Variable Two", language="en", property="name"),
169+
},)
170+
162171
df = mock_client.observations_dataframe(variable_dcids="var1",
163172
date="2024",
164173
entity_dcids=["entity1", "entity2"])
165174

166175
assert isinstance(df, pd.DataFrame)
167-
assert set(df.columns) == {"date", "entity", "variable", "value", "unit"}
176+
assert set(df.columns) == {
177+
"date", "entity", "entity_name", "variable", "variable_name", "value",
178+
"unit"
179+
}
168180
assert len(df) == 2
181+
assert df.iloc[0]["entity"] == "entity1"
182+
assert df.iloc[0]["entity_name"] == "Entity One"
183+
assert df.iloc[1]["entity"] == "entity2"
184+
assert df.iloc[1]["entity_name"] == "Entity Two"
169185
assert df.iloc[0]["variable"] == "var1"
186+
assert df.iloc[0]["variable_name"] == "Variable One"
170187
assert df.iloc[0]["value"] == 100
171188
assert df.iloc[0]["unit"] == "unit1"
172189
assert df.iloc[1]["variable"] == "var2"
190+
assert df.iloc[1]["variable_name"] == "Variable Two"
173191
assert df.iloc[1]["value"] == 200
174192
assert df.iloc[1]["unit"] == "unit2"
175193

@@ -219,7 +237,9 @@ def test_observations_dataframe_filters_by_facet_ids(mock_client):
219237
"""Tests that observations_dataframe includes facet filtering when property_filters are used."""
220238
mock_client._find_filter_facet_ids = MagicMock(
221239
return_value=["facet_1", "facet_2"])
222-
mock_client.observation.fetch_observations_by_entity_dcid.return_value.to_observation_records.return_value = []
240+
241+
mock_client.observation.fetch_observations_by_entity_dcid.return_value.to_observation_records.return_value = (
242+
[])
223243

224244
df = mock_client.observations_dataframe(
225245
variable_dcids="var1",
@@ -232,7 +252,8 @@ def test_observations_dataframe_filters_by_facet_ids(mock_client):
232252
variable_dcids="var1",
233253
date="2024",
234254
entity_dcids=["entity1"],
235-
filter_facet_ids=["facet_1", "facet_2"])
255+
filter_facet_ids=["facet_1", "facet_2"],
256+
)
236257
assert isinstance(df, pd.DataFrame)
237258

238259

datacommons_client/utils/data_processing.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import json
33
from typing import Any, Dict
44

5+
from datacommons_client.models.node import Name
6+
57

68
def unpack_arcs(arcs: Dict[str, Any]) -> Any:
79
"""Simplify the 'arcs' structure."""
@@ -99,16 +101,16 @@ def observations_as_records(data: dict, facets: dict) -> list[dict]:
99101
def group_variables_by_entity(
100102
data: dict[str, list[str]]) -> dict[str, list[str]]:
101103
"""Groups variables by the entities they are associated with.
102-
Takes a dictionary mapping statistical variable DCIDs to a list of entity DCIDs,
103-
and returns a new dictionary mapping each entity DCID to a list of statistical
104-
variables available for that entity.
105-
Args:
106-
data: A dictionary where each key is a variable DCID and the value is a list
107-
of entity DCIDs that have observations for that variable.
108-
Returns:
109-
A dictionary where each key is an entity DCID and the value is a list of
110-
variable DCIDs available for that entity.
111-
"""
104+
Takes a dictionary mapping statistical variable DCIDs to a list of entity DCIDs,
105+
and returns a new dictionary mapping each entity DCID to a list of statistical
106+
variables available for that entity.
107+
Args:
108+
data: A dictionary where each key is a variable DCID and the value is a list
109+
of entity DCIDs that have observations for that variable.
110+
Returns:
111+
A dictionary where each key is an entity DCID and the value is a list of
112+
variable DCIDs available for that entity.
113+
"""
112114
result: dict[str, list[str]] = {}
113115
for variable, entities in data.items():
114116
for entity in entities:
@@ -150,3 +152,18 @@ def to_json(self, exclude_none: bool = True) -> str:
150152
str: The JSON string representation of the instance.
151153
"""
152154
return json.dumps(self.to_dict(exclude_none=exclude_none), indent=2)
155+
156+
157+
def flatten_names_dictionary(names_dict: dict[str, Name]) -> dict[str, str]:
    """Collapse a DCID -> Name mapping into a plain DCID -> name-string mapping.

    Each `Name` object is serialised with `to_dict()` and only its 'value'
    entry is kept.

    Args:
        names_dict (dict[str, Name]): Mapping of DCIDs to `Name` objects.

    Returns:
        dict[str, str]: Mapping of the same DCIDs to the 'value' of each name.
    """
    flattened: dict[str, str] = {}
    for dcid, name in names_dict.items():
        # Go through to_dict() so the lookup uses the serialised form of Name.
        serialised = name.to_dict()
        flattened[dcid] = serialised['value']
    return flattened
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from datacommons_client.endpoints.node import NodeEndpoint
2+
from datacommons_client.utils.data_processing import flatten_names_dictionary
3+
4+
try:
5+
import pandas as pd
6+
except ImportError:
7+
pd = None
8+
9+
from datacommons_client.utils.decorators import requires_pandas
10+
11+
12+
@requires_pandas
def add_entity_names_to_observations_dataframe(
    endpoint: NodeEndpoint,
    observations_df: "pd.DataFrame",
    entity_columns: str | list[str],
) -> "pd.DataFrame":
    """Adds resolved names next to DCID columns of an observations DataFrame.

    For every column in `entity_columns`, the unique non-null values are passed
    to `endpoint.fetch_entity_names` and the returned names are written to a
    new `<column>_name` column inserted immediately after the source column.
    A `<column>_name` column that already exists is left untouched.

    The annotations for the DataFrame are quoted on purpose: this module sets
    `pd = None` when pandas is not installed, and an unquoted `pd.DataFrame`
    would raise at import time before `requires_pandas` gets a chance to run.

    Args:
        endpoint (NodeEndpoint): The NodeEndpoint instance for fetching entity
            names.
        observations_df (pd.DataFrame): The DataFrame containing observations.
            It is modified in place.
        entity_columns (str | list[str]): The column(s) containing DCIDs to
            resolve into names.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in, with the
            name columns added. An empty DataFrame is returned unchanged.

    Raises:
        ValueError: If any of the requested columns is not present in a
            non-empty DataFrame.
    """
    # Guard against empty DataFrame
    if observations_df.empty:
        return observations_df

    if not isinstance(entity_columns, list):
        entity_columns = [entity_columns]

    # Validate every requested column up front so that a bad column name fails
    # before any network call is made or the DataFrame is partially modified.
    for entity_column in entity_columns:
        if entity_column not in observations_df.columns:
            raise ValueError(
                "The specified entity column does not exist in the DataFrame.")

    for entity_column in entity_columns:
        name_column = f"{entity_column}_name"

        # Nothing to insert if the name column is already there, so skip the
        # lookup entirely instead of fetching names that would be discarded.
        if name_column in observations_df.columns:
            continue

        # Get unique DCIDs from the column; nulls cannot be resolved.
        unique_values = observations_df[entity_column].dropna().unique().tolist()

        # Guard against empty unique values
        if not unique_values:
            continue

        # Fetch the names and flatten them into a DCID -> name dictionary.
        response = endpoint.fetch_entity_names(entity_dcids=unique_values)
        names = flatten_names_dictionary(response)

        # Insert the names into a column next to the source column. Values
        # missing from `names` are left as NaN by `Series.map`.
        observations_df.insert(
            loc=observations_df.columns.get_loc(entity_column) + 1,
            column=name_column,
            value=observations_df[entity_column].map(names),
        )

    return observations_df

0 commit comments

Comments
 (0)