Skip to content

Commit 90d31a1

Browse files
Implement MDIO Dataset builder to create in-memory instance of schemas.v1.dataset.Dataset (#568)
* schema_v1-dataset_builder-add_dimension * First take on add_dimension(), add_coordinate(), add_variable() * Finished add_dimension, add_coordinate, add_variable * Work on build * Generalize _to_dictionary() * build * Dataset Build - pass one * Revert .container changes * PR review: remove DEVELOPER_NOTES.md * PR Review: add_coordinate() should accept only data_type: ScalarType * PR review: add_variable() data_type remove default * RE review: do not add dimension variable * PR Review: get api version from the package version * PR Review: remove add_dimension_coordinate * PR Review: add_coordinate() remove data_type default value * PR Review: improve unit tests by extracting common functionality in validate* functions * Remove the Dockerfile changes. They are not supposed to be a part of this PR * PR Review: run ruff * PR Review: fix pre-commit errors * remove some noqa overrides --------- Co-authored-by: Altay Sansal <[email protected]>
1 parent 4062a77 commit 90d31a1

8 files changed

+1277
-0
lines changed
Lines changed: 335 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,335 @@
1+
"""Builder pattern implementation for MDIO v1 schema models."""
2+
3+
from datetime import UTC
4+
from datetime import datetime
5+
from enum import Enum
6+
from enum import auto
7+
from importlib import metadata
8+
from typing import Any
9+
from typing import TypeAlias
10+
11+
from pydantic import BaseModel
12+
13+
from mdio.schemas.compressors import ZFP
14+
from mdio.schemas.compressors import Blosc
15+
from mdio.schemas.dimension import NamedDimension
16+
from mdio.schemas.dtype import ScalarType
17+
from mdio.schemas.dtype import StructuredType
18+
from mdio.schemas.metadata import ChunkGridMetadata
19+
from mdio.schemas.metadata import UserAttributes
20+
from mdio.schemas.v1.dataset import Dataset
21+
from mdio.schemas.v1.dataset import DatasetInfo
22+
from mdio.schemas.v1.stats import StatisticsMetadata
23+
from mdio.schemas.v1.units import AllUnits
24+
from mdio.schemas.v1.variable import Coordinate
25+
from mdio.schemas.v1.variable import Variable
26+
27+
# Union of every metadata model that _to_dictionary() may be asked to flatten.
AnyMetadataList: TypeAlias = list[
    AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata | DatasetInfo
]

# Metadata entries accepted by MDIODatasetBuilder.add_coordinate().
CoordinateMetadataList: TypeAlias = list[AllUnits | UserAttributes]

# Metadata entries accepted by MDIODatasetBuilder.add_variable().
VariableMetadataList: TypeAlias = list[
    AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata
]

# Metadata entries that describe the dataset as a whole.
DatasetMetadataList: TypeAlias = list[DatasetInfo | UserAttributes]
35+
36+
37+
class _BuilderState(Enum):
38+
"""States for the template builder."""
39+
40+
INITIAL = auto()
41+
HAS_DIMENSIONS = auto()
42+
HAS_COORDINATES = auto()
43+
HAS_VARIABLES = auto()
44+
45+
46+
def _get_named_dimension(
47+
dimensions: list[NamedDimension], name: str, size: int | None = None
48+
) -> NamedDimension | None:
49+
"""Get a dimension by name and optional size from the list[NamedDimension]."""
50+
if dimensions is None:
51+
return False
52+
if not isinstance(name, str):
53+
msg = f"Expected str, got {type(name).__name__}"
54+
raise TypeError(msg)
55+
56+
nd = next((dim for dim in dimensions if dim.name == name), None)
57+
if nd is None:
58+
return None
59+
if size is not None and nd.size != size:
60+
msg = f"Dimension {name!r} found but size {nd.size} does not match expected size {size}"
61+
raise ValueError(msg)
62+
return nd
63+
64+
65+
def _to_dictionary(
    val: BaseModel | dict[str, Any] | AnyMetadataList | None,
) -> dict[str, Any] | None:
    """Convert a pydantic model, a dictionary or a list of those to a dictionary.

    Args:
        val: The value to convert. ``None`` is passed through unchanged.

    Returns:
        ``None`` if ``val`` is ``None``; the JSON-mode, by-alias dump of a
        ``BaseModel``; the very same object if ``val`` is already a ``dict``
        (no copy is made); or, for a list, a single dictionary obtained by
        merging the conversion of each non-``None`` item in order, so later
        items overwrite earlier keys.

    Raises:
        TypeError: If ``val`` (or an item of a list) is of an unsupported type.
    """
    if val is None:
        return None
    if isinstance(val, BaseModel):
        return val.model_dump(mode="json", by_alias=True)
    if isinstance(val, dict):
        return val
    if isinstance(val, list):
        merged: dict[str, Any] = {}
        for item in val:
            # None entries stand for "no metadata of this kind" and are skipped.
            if item is not None:
                merged.update(_to_dictionary(item))
        return merged

    msg = f"Expected BaseModel, dict or list, got {type(val).__name__}"
    raise TypeError(msg)
82+
83+
84+
class MDIODatasetBuilder:
    """Builder for creating MDIO datasets with enforced build order.

    This builder implements the builder pattern to create MDIO datasets with a v1 schema.
    The intended call order is:
    1. Add dimensions via add_dimension() (at least one is required).
    2. Optionally add coordinates via add_coordinate().
    3. Add variables via add_variable().
    4. Call build() to create the dataset.

    The only ordering that is enforced is that at least one dimension must exist
    before add_coordinate(), add_variable() or build() can be called.
    """

    def __init__(self, name: str, attributes: UserAttributes | None = None):
        """Initialize an empty builder.

        Args:
            name: Name of the dataset, stored in the dataset info metadata.
            attributes: Optional user attributes merged into the dataset metadata by build().
        """
        try:
            api_version = metadata.version("multidimio")
        except metadata.PackageNotFoundError:
            # The package metadata is unavailable; fall back to a placeholder.
            api_version = "unknown"

        self._info = DatasetInfo(name=name, api_version=api_version, created_on=datetime.now(UTC))
        self._attributes = attributes
        self._dimensions: list[NamedDimension] = []
        self._coordinates: list[Coordinate] = []
        self._variables: list[Variable] = []
        self._state = _BuilderState.INITIAL
        self._unnamed_variable_counter = 0

    def _resolve_dimensions(self, dimensions: list[str]) -> list[NamedDimension]:
        """Map dimension names to the already registered NamedDimension objects.

        Args:
            dimensions: Names of dimensions previously added via add_dimension().

        Raises:
            ValueError: If any name does not refer to a registered dimension.

        Returns:
            The matching NamedDimension objects, in the order of 'dimensions'.
        """
        resolved = []
        for dim_name in dimensions:
            nd = _get_named_dimension(self._dimensions, dim_name)
            if nd is None:
                msg = f"Pre-existing dimension named {dim_name!r} is not found"
                raise ValueError(msg)
            resolved.append(nd)
        return resolved

    def add_dimension(self, name: str, size: int) -> "MDIODatasetBuilder":
        """Add a dimension.

        This function must be called at least once before adding coordinates or variables.

        Args:
            name: Name of the dimension
            size: Size of the dimension

        Raises:
            ValueError: If 'name' is empty.
                        If the dimension is already defined.

        Returns:
            self: Returns self for method chaining
        """
        if not name:
            msg = "'name' must be a non-empty string"
            raise ValueError(msg)

        # Validate that the dimension is not already defined
        if any(dim.name == name for dim in self._dimensions):
            msg = "Adding dimension with the same name twice is not allowed"
            raise ValueError(msg)

        self._dimensions.append(NamedDimension(name=name, size=size))
        self._state = _BuilderState.HAS_DIMENSIONS
        return self

    def add_coordinate(  # noqa: PLR0913
        self,
        name: str,
        *,
        long_name: str | None = None,
        dimensions: list[str],
        data_type: ScalarType,
        compressor: Blosc | ZFP | None = None,
        metadata_info: CoordinateMetadataList | None = None,
    ) -> "MDIODatasetBuilder":
        """Add a coordinate after adding at least one dimension.

        This function must be called after all required dimensions are added via add_dimension().
        This call also creates a variable with the same name as the coordinate (the
        coordinate variable). If creating that variable fails, the coordinate is not kept.

        Args:
            name: Name of the coordinate
            long_name: Optional long name for the coordinate
            dimensions: List of dimension names that the coordinate is associated with
            data_type: Data type for the coordinate (required)
            compressor: Compressor used for the variable (defaults to None)
            metadata_info: Optional metadata information for the coordinate

        Raises:
            ValueError: If no dimensions have been added yet.
                        If 'name' is empty.
                        If 'dimensions' is empty.
                        If the coordinate is already defined.
                        If any referenced dimension is not already defined.
                        If a variable named 'name' is already defined.

        Returns:
            self: Returns self for method chaining
        """
        if self._state == _BuilderState.INITIAL:
            msg = "Must add at least one dimension before adding coordinates"
            raise ValueError(msg)
        if not name:
            msg = "'name' must be a non-empty string"
            raise ValueError(msg)
        if not dimensions:
            msg = "'dimensions' must be a non-empty list"
            raise ValueError(msg)

        # Validate that the coordinate is not already defined
        if any(existing.name == name for existing in self._coordinates):
            msg = "Adding coordinate with the same name twice is not allowed"
            raise ValueError(msg)

        # Validate that all referenced dimensions are already defined
        named_dimensions = self._resolve_dimensions(dimensions)

        coord = Coordinate(
            name=name,
            longName=long_name,
            dimensions=named_dimensions,
            compressor=compressor,
            dataType=data_type,
            metadata=_to_dictionary(metadata_info),
        )

        # add_variable() resolves the coordinate by name, so it has to be registered
        # first; roll the registration back if the coordinate variable cannot be
        # created, so a failed call does not leave an orphan coordinate behind.
        self._coordinates.append(coord)
        try:
            self.add_variable(
                name=coord.name,
                long_name=f"'{coord.name}' coordinate variable",
                dimensions=dimensions,  # dimension names (list[str])
                data_type=coord.data_type,
                compressor=compressor,
                coordinates=[name],  # Use the coordinate name as a reference
                metadata_info=coord.metadata,
            )
        except Exception:
            self._coordinates.pop()
            raise

        self._state = _BuilderState.HAS_COORDINATES
        return self

    def add_variable(  # noqa: PLR0913
        self,
        name: str,
        *,
        long_name: str | None = None,
        dimensions: list[str],
        data_type: ScalarType | StructuredType,
        compressor: Blosc | ZFP | None = None,
        coordinates: list[str] | None = None,
        metadata_info: VariableMetadataList | None = None,
    ) -> "MDIODatasetBuilder":
        """Add a variable after adding at least one dimension and, optionally, coordinate.

        This function must be called after all required dimensions are added via add_dimension()
        This function must be called after all required coordinates are added via add_coordinate().

        If 'coordinates' consists of exactly one name and that name equals the variable
        name, the matching Coordinate object itself (rather than its name) is stored on
        the variable. This is how add_coordinate() creates coordinate variables.

        Args:
            name: Name of the variable
            long_name: Optional long name for the variable
            dimensions: List of dimension names that the variable is associated with
            data_type: Data type for the variable (required)
            compressor: Compressor used for the variable (defaults to None)
            coordinates: List of coordinate names that the variable is associated with
                         (defaults to None, meaning no coordinates)
            metadata_info: Optional metadata information for the variable

        Raises:
            ValueError: If no dimensions have been added yet.
                        If 'name' is empty.
                        If 'dimensions' is empty.
                        If the variable is already defined.
                        If any referenced dimension is not already defined.
                        If any referenced coordinate is not already defined.

        Returns:
            self: Returns self for method chaining.
        """
        if self._state == _BuilderState.INITIAL:
            msg = "Must add at least one dimension before adding variables"
            raise ValueError(msg)
        if not name:
            msg = "'name' must be a non-empty string"
            raise ValueError(msg)
        if not dimensions:
            msg = "'dimensions' must be a non-empty list"
            raise ValueError(msg)

        # Validate that the variable is not already defined
        if any(existing.name == name for existing in self._variables):
            msg = "Adding variable with the same name twice is not allowed"
            raise ValueError(msg)

        # Validate that all referenced dimensions are already defined
        named_dimensions = self._resolve_dimensions(dimensions)

        # Validate that all referenced coordinates are already defined
        coordinate_objs: list[Coordinate] = []
        for coord_name in coordinates or []:
            found: Coordinate | None = next(
                (c for c in self._coordinates if c.name == coord_name), None
            )
            if found is None:
                msg = f"Pre-existing coordinate named {coord_name!r} is not found"
                raise ValueError(msg)
            coordinate_objs.append(found)

        # If this is a dimension coordinate variable, embed the Coordinate into it
        if coordinates is not None and len(coordinates) == 1 and coordinates[0] == name:
            coordinates = coordinate_objs

        var = Variable(
            name=name,
            long_name=long_name,
            dimensions=named_dimensions,
            data_type=data_type,
            compressor=compressor,
            coordinates=coordinates,
            metadata=_to_dictionary(metadata_info),
        )
        self._variables.append(var)

        self._state = _BuilderState.HAS_VARIABLES
        return self

    def build(self) -> Dataset:
        """Build the final dataset.

        This function must be called after at least one dimension is added via add_dimension().
        It creates a Dataset object from the variables added so far (including the
        coordinate variables created by add_coordinate()) and the dataset metadata.

        Raises:
            ValueError: If no dimensions have been added yet.

        Returns:
            Dataset: The built dataset holding all added variables and the merged
                dataset info / user attribute metadata.
        """
        if self._state == _BuilderState.INITIAL:
            msg = "Must add at least one dimension before building"
            raise ValueError(msg)

        # self._attributes may be None; _to_dictionary() skips None list entries.
        dataset_metadata = _to_dictionary([self._info, self._attributes])
        return Dataset(variables=self._variables, metadata=dataset_metadata)

tests/unit/v1/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Unit tests for parts of the MDIO package related to the v1 schema."""

0 commit comments

Comments
 (0)