From 3d7d408a5a725f9844560822797f92ae33cf3638 Mon Sep 17 00:00:00 2001 From: Sai Krishnan Chandrasekar Date: Thu, 13 Nov 2025 09:59:16 -0800 Subject: [PATCH 1/6] Add Zarr reader for Crash --- .../crash/conf/config.yaml | 2 +- .../crash/conf/reader/zarr.yaml | 18 + .../crash/tests/test_zarr_reader.py | 340 ++++++++++++++++++ .../structural_mechanics/crash/zarr_reader.py | 226 ++++++++++++ 4 files changed, 585 insertions(+), 1 deletion(-) create mode 100644 examples/structural_mechanics/crash/conf/reader/zarr.yaml create mode 100644 examples/structural_mechanics/crash/tests/test_zarr_reader.py create mode 100644 examples/structural_mechanics/crash/zarr_reader.py diff --git a/examples/structural_mechanics/crash/conf/config.yaml b/examples/structural_mechanics/crash/conf/config.yaml index 3a3e288d66..7bba46e8c4 100644 --- a/examples/structural_mechanics/crash/conf/config.yaml +++ b/examples/structural_mechanics/crash/conf/config.yaml @@ -25,7 +25,7 @@ experiment_desc: "unified training recipe for crash models" run_desc: "unified training recipe for crash models" defaults: - - reader: vtp #d3plot + - reader: vtp # Options are: vtp, d3plot, zarr - datapipe: point_cloud # will be overridden by model configs - model: transolver_autoregressive_rollout_training - training: default diff --git a/examples/structural_mechanics/crash/conf/reader/zarr.yaml b/examples/structural_mechanics/crash/conf/reader/zarr.yaml new file mode 100644 index 0000000000..733730067f --- /dev/null +++ b/examples/structural_mechanics/crash/conf/reader/zarr.yaml @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_target_: zarr_reader.Reader +_convert_: all \ No newline at end of file diff --git a/examples/structural_mechanics/crash/tests/test_zarr_reader.py b/examples/structural_mechanics/crash/tests/test_zarr_reader.py new file mode 100644 index 0000000000..d79f173e77 --- /dev/null +++ b/examples/structural_mechanics/crash/tests/test_zarr_reader.py @@ -0,0 +1,340 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
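"""Unit tests for the crash-example Zarr reader (`zarr_reader.py`)."""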
+ +import tempfile +from pathlib import Path + +import numpy as np +import pytest +import zarr + +# Import functions from zarr_reader +import sys + +sys.path.insert(0, str(Path(__file__).parent.parent)) +import zarr_reader + + +def create_mock_zarr_store( + store_path: Path, + num_timesteps: int = 3, + num_nodes: int = 4, + thickness_value: float = 1.0, +): + """ + Helper function to create a mock Zarr store with crash simulation data. + + Args: + store_path: Path where the Zarr store should be created + num_timesteps: Number of timesteps + num_nodes: Number of nodes + thickness_value: Constant thickness value for all nodes + + Returns: + Tuple of (mesh_pos, node_thickness, edges) arrays that were written + """ + store_path.mkdir(exist_ok=True) + + # Create mock data + mesh_pos = np.random.randn(num_timesteps, num_nodes, 3).astype(np.float32) + node_thickness = np.ones(num_nodes, dtype=np.float32) * thickness_value + edges = np.array([[0, 1], [1, 2], [2, 3], [3, 0]], dtype=np.int64) + + # Write to Zarr store + store = zarr.open(str(store_path), mode="w") + store.create_dataset("mesh_pos", data=mesh_pos, dtype=np.float32) + store.create_dataset("thickness", data=node_thickness, dtype=np.float32) + store.create_dataset("edges", data=edges, dtype=np.int64) + + return mesh_pos, node_thickness, edges + + +@pytest.fixture +def mock_zarr_store(): + """Create a temporary Zarr store with mock crash simulation data.""" + with tempfile.TemporaryDirectory() as temp_dir: + store_path = Path(temp_dir) / "Run001.zarr" + mesh_pos, node_thickness, edges = create_mock_zarr_store( + store_path, thickness_value=2.0 + ) + yield temp_dir, mesh_pos, node_thickness, edges + + +@pytest.fixture +def mock_zarr_directory(): + """Create a directory with multiple Zarr stores.""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create multiple zarr stores + for i in range(3): + store_path = temp_path / f"Run{i:03d}.zarr" + create_mock_zarr_store(store_path) + + # Create a non-zarr directory (should be ignored) + (temp_path / "NotAZarr").mkdir() + + # Create a regular file (should be ignored) + (temp_path / "some_file.txt").touch() + + yield temp_dir + + +def test_find_zarr_stores(mock_zarr_directory): + """Test that find_zarr_stores correctly identifies Zarr directories.""" + zarr_stores = zarr_reader.find_zarr_stores(mock_zarr_directory) + + assert len(zarr_stores) == 3, f"Expected 3 zarr stores, got {len(zarr_stores)}" + assert all(path.endswith(".zarr") for path in zarr_stores) + assert all("Run" in Path(path).name for path in zarr_stores) + + +def test_find_zarr_stores_empty_directory(): + """Test find_zarr_stores with empty directory.""" + with tempfile.TemporaryDirectory() as temp_dir: + zarr_stores = zarr_reader.find_zarr_stores(temp_dir) + assert len(zarr_stores) == 0, ( + "Should return empty list for directory with no zarr stores" + ) + + +def test_find_zarr_stores_nonexistent_directory(): + """Test find_zarr_stores with nonexistent directory.""" + zarr_stores = zarr_reader.find_zarr_stores("/nonexistent/path") + assert len(zarr_stores) == 0, "Should return empty list for nonexistent directory" + + +def test_load_zarr_store(mock_zarr_store): + """Test loading data from a Zarr store.""" + temp_dir, expected_mesh_pos, expected_thickness, expected_edges = mock_zarr_store + store_path = Path(temp_dir) / "Run001.zarr" + + mesh_pos, edges, point_data_dict = zarr_reader.load_zarr_store(str(store_path)) + + # Check shapes + assert mesh_pos.shape == expected_mesh_pos.shape + assert 
edges.shape == expected_edges.shape + assert "thickness" in point_data_dict, "Should have thickness in point_data" + assert point_data_dict["thickness"].shape == expected_thickness.shape + + # Check data types + assert mesh_pos.dtype == np.float64, "mesh_pos should be float64" + assert point_data_dict["thickness"].dtype == np.float32, ( + "thickness should be float32" + ) + assert edges.dtype == np.int64, "edges should be int64" + + # Check values + np.testing.assert_array_almost_equal(mesh_pos, expected_mesh_pos) + np.testing.assert_array_almost_equal( + point_data_dict["thickness"], expected_thickness + ) + np.testing.assert_array_equal(edges, expected_edges) + + +def test_load_zarr_store_missing_fields(): + """Test that loading a Zarr store with missing required fields raises KeyError.""" + with tempfile.TemporaryDirectory() as temp_dir: + store_path = Path(temp_dir) / "incomplete.zarr" + store_path.mkdir() + + # Create store with only thickness (missing mesh_pos and edges) + store = zarr.open(str(store_path), mode="w") + store.create_dataset("thickness", data=np.ones(4, dtype=np.float32)) + + # Should raise KeyError for missing mesh_pos + with pytest.raises(KeyError, match="mesh_pos"): + zarr_reader.load_zarr_store(str(store_path)) + + # Test missing edges + store_path2 = Path(temp_dir) / "incomplete2.zarr" + store_path2.mkdir() + store2 = zarr.open(str(store_path2), mode="w") + store2.create_dataset( + "mesh_pos", data=np.random.randn(3, 4, 3).astype(np.float32) + ) + + # Should raise KeyError for missing edges + with pytest.raises(KeyError, match="edges"): + zarr_reader.load_zarr_store(str(store_path2)) + + +def test_load_zarr_store_multiple_point_data_fields(): + """Test that load_zarr_store dynamically reads all point data fields.""" + with tempfile.TemporaryDirectory() as temp_dir: + store_path = Path(temp_dir) / "multi_fields.zarr" + store_path.mkdir() + + # Create store with multiple point data fields + num_nodes = 10 + store = zarr.open(str(store_path), mode="w") + store.create_dataset( + "mesh_pos", data=np.random.randn(3, num_nodes, 3).astype(np.float32) + ) + store.create_dataset("edges", data=np.array([[0, 1]], dtype=np.int64)) + # Add multiple point data fields + store.create_dataset("thickness", data=np.ones(num_nodes, dtype=np.float32)) + store.create_dataset( + "stress", data=np.random.randn(num_nodes).astype(np.float32) + ) + store.create_dataset( + "temperature", data=np.random.randn(num_nodes).astype(np.float32) + ) + # This should be skipped (mesh connectivity, not point data) + store.create_dataset( + "mesh_connectivity_flat", data=np.array([0, 1, 2], dtype=np.int64) + ) + + mesh_pos, edges, point_data_dict = zarr_reader.load_zarr_store(str(store_path)) + + # Should have all three point data fields + assert "thickness" in point_data_dict + assert "stress" in point_data_dict + assert "temperature" in point_data_dict + # Should NOT include mesh connectivity + assert "mesh_connectivity_flat" not in point_data_dict + # Should NOT include mesh_pos or edges + assert "mesh_pos" not in point_data_dict + assert "edges" not in point_data_dict + + # Check that all point data fields have correct shape + for name, data in point_data_dict.items(): + assert data.shape == (num_nodes,), ( + f"{name} should have shape ({num_nodes},)" + ) + assert data.dtype == np.float32, f"{name} should be float32" + + +def test_process_zarr_data(mock_zarr_directory): + """Test processing multiple Zarr stores.""" + srcs, dsts, point_data = zarr_reader.process_zarr_data( + 
data_dir=mock_zarr_directory, + num_samples=2, + ) + + # Check we got 2 samples + assert len(srcs) == 2, f"Expected 2 samples, got {len(srcs)}" + assert len(dsts) == 2 + assert len(point_data) == 2 + + # Check each sample has correct structure + for i in range(2): + assert srcs[i].ndim == 1, "srcs should be 1D array" + assert dsts[i].ndim == 1, "dsts should be 1D array" + assert len(srcs[i]) == len(dsts[i]), "srcs and dsts should have same length" + + # Check point_data structure + assert "coords" in point_data[i], "point_data should have 'coords' key" + assert "thickness" in point_data[i], "point_data should have 'thickness' key" + + coords = point_data[i]["coords"] + thickness = point_data[i]["thickness"] + + assert coords.ndim == 3, "coords should be [T,N,3]" + assert coords.shape[-1] == 3, "coords last dimension should be 3" + assert thickness.ndim == 1, "thickness should be 1D" + assert len(thickness) == coords.shape[1], ( + "thickness length should match num_nodes" + ) + + +def test_process_zarr_data_no_stores(): + """Test that processing directory with no Zarr stores raises error.""" + with tempfile.TemporaryDirectory() as temp_dir: + with pytest.raises(ValueError, match="No .zarr stores found"): + zarr_reader.process_zarr_data( + data_dir=temp_dir, + num_samples=1, + ) + + +def test_process_zarr_data_validation(): + """Test that process_zarr_data validates data shapes.""" + with tempfile.TemporaryDirectory() as temp_dir: + store_path = Path(temp_dir) / "bad_store.zarr" + store_path.mkdir() + + # Create store with invalid mesh_pos shape (should be [T,N,3]) + store = zarr.open(str(store_path), mode="w") + store.create_dataset( + "mesh_pos", data=np.random.randn(3, 4, 2).astype(np.float32) + ) # Wrong last dim + store.create_dataset("thickness", data=np.ones(4, dtype=np.float32)) + store.create_dataset("edges", data=np.array([[0, 1]], dtype=np.int64)) + + with pytest.raises(ValueError, match="mesh_pos must be"): + zarr_reader.process_zarr_data( + data_dir=temp_dir, + num_samples=1, + ) + + +def test_process_zarr_data_edge_bounds(): + """Test that process_zarr_data validates edge indices are within bounds.""" + with tempfile.TemporaryDirectory() as temp_dir: + store_path = Path(temp_dir) / "bad_edges.zarr" + store_path.mkdir() + + num_nodes = 4 + store = zarr.open(str(store_path), mode="w") + store.create_dataset( + "mesh_pos", data=np.random.randn(3, num_nodes, 3).astype(np.float32) + ) + store.create_dataset("thickness", data=np.ones(num_nodes, dtype=np.float32)) + # Edge references node 10 which is out of bounds + store.create_dataset("edges", data=np.array([[0, 10]], dtype=np.int64)) + + with pytest.raises(ValueError, match="Edge indices out of bounds"): + zarr_reader.process_zarr_data( + data_dir=temp_dir, + num_samples=1, + ) + + +def test_reader_class(mock_zarr_directory): + """Test the Reader class callable interface.""" + reader = zarr_reader.Reader() + + srcs, dsts, point_data = reader( + data_dir=mock_zarr_directory, + num_samples=2, + split="train", + ) + + assert len(srcs) == 2 + assert len(dsts) == 2 + assert len(point_data) == 2 + + +def test_natural_sorting(mock_zarr_directory): + """Test that Zarr stores are sorted naturally (Run1, Run2, ..., Run10).""" + temp_path = Path(mock_zarr_directory) + + # Add more stores with different numbering + for i in [10, 5, 20]: + store_path = temp_path / f"Run{i}.zarr" + create_mock_zarr_store(store_path) + + zarr_stores = zarr_reader.find_zarr_stores(mock_zarr_directory) + store_names = [Path(p).name for p in zarr_stores] + + # 
Should be sorted: Run000, Run001, Run002, Run5, Run10, Run20 + assert store_names[0] == "Run000.zarr" + assert store_names[1] == "Run001.zarr" + assert store_names[2] == "Run002.zarr" + assert "Run5.zarr" in store_names + assert "Run10.zarr" in store_names + assert "Run20.zarr" in store_names diff --git a/examples/structural_mechanics/crash/zarr_reader.py b/examples/structural_mechanics/crash/zarr_reader.py new file mode 100644 index 0000000000..191494c2a5 --- /dev/null +++ b/examples/structural_mechanics/crash/zarr_reader.py @@ -0,0 +1,226 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import numpy as np +import zarr + + +def find_zarr_stores(base_data_dir: str) -> list[str]: + """ + Find all Zarr stores (directories ending with .zarr) in the base directory. + + Args: + base_data_dir: Path to directory containing Zarr stores. + + Returns: + List of Zarr store paths sorted naturally. + """ + if not os.path.isdir(base_data_dir): + return [] + + zarr_stores = [ + os.path.join(base_data_dir, f) + for f in os.listdir(base_data_dir) + if f.endswith(".zarr") and os.path.isdir(os.path.join(base_data_dir, f)) + ] + + def natural_key(name): + """Natural sort key to handle numeric sorting.""" + return [ + int(s) if s.isdigit() else s.lower() + for s in re.findall(r"\d+|\D+", os.path.basename(name)) + ] + + return sorted(zarr_stores, key=natural_key) + + +def load_zarr_store(zarr_path: str): + """ + Load mesh positions, edges, and all point data fields from a Zarr store. + + Args: + zarr_path: Path to the Zarr store directory. + + Returns: + mesh_pos: (timesteps, num_nodes, 3) temporal positions + edges: (num_edges, 2) edge connectivity + point_data_dict: Dictionary of all point data fields (e.g., thickness, etc.) + """ + store = zarr.open(zarr_path, mode="r") + + # Read mesh positions (temporal coordinates) + if "mesh_pos" not in store: + raise KeyError(f"'mesh_pos' not found in Zarr store {zarr_path}") + mesh_pos = np.array(store["mesh_pos"][:], dtype=np.float64) + + # Read edges + if "edges" not in store: + raise KeyError(f"'edges' not found in Zarr store {zarr_path}") + edges = np.array(store["edges"][:], dtype=np.int64) + + # Extract all other datasets as point data (excluding mesh-level data) + # Skip: mesh_pos, edges, mesh_connectivity_* (these are not per-node features) + point_data_dict = {} + for name in store.keys(): + if name in ("mesh_pos", "edges"): + continue + if name.startswith("mesh_connectivity_"): + continue + # Read as point data feature + point_data_dict[name] = np.array(store[name][:], dtype=np.float32) + + return mesh_pos, edges, point_data_dict + + +def process_zarr_data( + data_dir: str, + num_samples: int, + logger=None, +): + """ + Process Zarr crash simulation data from a given directory. + + Each .zarr store is treated as one sample. 
Reads mesh positions, edges, + and all available point data fields (e.g., thickness, etc.) from the Zarr stores. + + Args: + data_dir: Directory containing .zarr stores + num_samples: Maximum number of samples to process + logger: Optional logger for logging progress + + Returns: + srcs: List of source node indices for edges (one array per sample) + dsts: List of destination node indices for edges (one array per sample) + point_data_all: List of dicts with 'coords' and all point data fields + """ + zarr_stores = find_zarr_stores(data_dir) + + if not zarr_stores: + if logger: + logger.error(f"No .zarr stores found in: {data_dir}") + raise ValueError(f"No .zarr stores found in: {data_dir}") + + srcs, dsts = [], [] + point_data_all = [] + + processed_runs = 0 + for zarr_path in zarr_stores: + if processed_runs >= num_samples: + break + + if logger: + logger.info(f"Processing Zarr store: {os.path.basename(zarr_path)}") + + try: + mesh_pos, edges, point_data_dict = load_zarr_store(zarr_path) + + # Validate shapes + if mesh_pos.ndim != 3 or mesh_pos.shape[-1] != 3: + raise ValueError( + f"mesh_pos must be [T,N,3], got {mesh_pos.shape} in {zarr_path}" + ) + + if edges.ndim != 2 or edges.shape[-1] != 2: + raise ValueError( + f"edges must be [E,2], got {edges.shape} in {zarr_path}" + ) + + num_nodes = mesh_pos.shape[1] + + # Validate point data features + for name, data in point_data_dict.items(): + if data.ndim != 1: + raise ValueError( + f"Point data '{name}' must be [N], got {data.shape} in {zarr_path}" + ) + if len(data) != num_nodes: + raise ValueError( + f"Point data '{name}' length {len(data)} doesn't match " + f"number of nodes {num_nodes} in {zarr_path}" + ) + + # Validate edge indices are within bounds + if edges.size > 0: + if edges.min() < 0 or edges.max() >= num_nodes: + raise ValueError( + f"Edge indices out of bounds [0, {num_nodes - 1}] in {zarr_path}" + ) + + # Extract source and destination node indices from edges + src, dst = edges.T + srcs.append(src) + dsts.append(dst) + + # Create record with coordinates and all point data fields + record = {"coords": mesh_pos} + record.update(point_data_dict) # Add all point data features dynamically + point_data_all.append(record) + + processed_runs += 1 + + except Exception as e: + if logger: + logger.error(f"Error processing {zarr_path}: {e}") + raise + + if logger: + logger.info(f"Successfully processed {processed_runs} Zarr stores") + + return srcs, dsts, point_data_all + + +class Reader: + """ + Reader for Zarr crash simulation stores. + + This reader loads preprocessed crash simulation data from Zarr stores + created by the PhysicsNeMo Curator ETL pipeline. + """ + + def __init__(self): + """Initialize the Zarr reader.""" + pass + + def __call__( + self, + data_dir: str, + num_samples: int, + split: str | None = None, + logger=None, + **kwargs, + ): + """ + Load Zarr crash simulation data. 
+ + Args: + data_dir: Directory containing .zarr stores + num_samples: Number of samples to load + split: Data split ('train', 'validation', 'test') - not used for Zarr + logger: Optional logger + **kwargs: Additional arguments (ignored) + + Returns: + srcs: List of source node arrays for graph edges + dsts: List of destination node arrays for graph edges + point_data: List of dicts with 'coords' and all available point data fields + """ + return process_zarr_data( + data_dir=data_dir, + num_samples=num_samples, + logger=logger, + ) From 748c8f95a9143d91205f5d69c91a0a69a9b0436e Mon Sep 17 00:00:00 2001 From: Sai Krishnan Chandrasekar Date: Thu, 13 Nov 2025 10:10:26 -0800 Subject: [PATCH 2/6] Update README --- examples/structural_mechanics/crash/README.md | 121 +++++++++++++++--- 1 file changed, 104 insertions(+), 17 deletions(-) diff --git a/examples/structural_mechanics/crash/README.md b/examples/structural_mechanics/crash/README.md index 1475fa393f..6b8a5ebb3e 100644 --- a/examples/structural_mechanics/crash/README.md +++ b/examples/structural_mechanics/crash/README.md @@ -36,7 +36,7 @@ For an in-depth comparison between the Transolver and MeshGraphNet models and th ```yaml # conf/config.yaml defaults: - - reader: vtp # or d3plot, or your custom reader + - reader: vtp # vtp, zarr, d3plot, or your custom reader - datapipe: point_cloud # or graph - model: transolver_time_conditional # or an MGN variant - training: default @@ -47,7 +47,7 @@ defaults: 2) Point to your datasets and core training knobs. - `conf/training/default.yaml`: - - `raw_data_dir`: path to TRAIN runs (folder of run folders for d3plot, or folder of .vtp files for VTP) + - `raw_data_dir`: path to TRAIN runs (folder of run folders for d3plot, folder of .vtp files for VTP, or folder of .zarr stores for Zarr) - `num_time_steps`: number of frames to use per run - `num_training_samples`: how many runs to load @@ -77,6 +77,7 @@ features: [thickness] # or [] for no features; preserve order if adding more 4) Reader‑specific options (optional). - d3plot: `conf/reader/d3plot.yaml` → `wall_node_disp_threshold` +- VTP and Zarr readers have no additional options (they read pre-processed data) 5) Model config: ensure input dimensions match your features. @@ -127,26 +128,38 @@ This will install: [PhysicsNeMo-Curator](https://github.com/NVIDIA/physicsnemo-curator). Using `PhysicsNeMo-Curator`, crash simulation data from LS-DYNA can be processed into training-ready formats easily. -Currently, this can be used to preprocess d3plot files into VTP. +PhysicsNeMo-Curator can preprocess d3plot files into **VTP** (for visualization and smaller datasets) or **Zarr** (for large-scale ML training). ### Quick Start Install PhysicsNeMo-Curator following [these instructions](https://github.com/NVIDIA/physicsnemo-curator?tab=readme-ov-file#installation-and-usage). -Process your LS-DYNA data: +Process your LS-DYNA data to **VTP format**: ```bash export PYTHONPATH=$PYTHONPATH:examples && -physicsnemo-curator-etl \ - --config-dir=examples/config \ - --config-name=crash_etl \ - etl.source.input_dir=/data/crash_sims/ \ - etl.sink.output_dir=/data/crash_processed_vtp/ \ +physicsnemo-curator-etl \ + --config-dir=examples/structural_mechanics/crash/config \ + --config-name=crash_etl \ + serialization_format=vtp \ + etl.source.input_dir=/data/crash_sims/ \ + serialization_format.sink.output_dir=/data/crash_vtp/ \ etl.processing.num_processes=4 ``` -This will process all LS-DYNA runs in `/data/crash_sims/` and output VTP files to `/data/crash_processed_vtp/`. 
+Or process to **Zarr format** for large-scale training: + +```bash +export PYTHONPATH=$PYTHONPATH:examples && +physicsnemo-curator-etl \ + --config-dir=examples/structural_mechanics/crash/config \ + --config-name=crash_etl \ + serialization_format=zarr \ + etl.source.input_dir=/data/crash_sims/ \ + serialization_format.sink.output_dir=/data/crash_zarr/ \ + etl.processing.num_processes=4 +``` ### Input Data Structure @@ -165,7 +178,7 @@ crash_sims/ ### Output Formats -#### VTP Format (Recommended for this example) +#### VTP Format Produces single VTP file per run with all timesteps as displacement fields: @@ -179,10 +192,33 @@ crash_processed_vtp/ Each VTP contains: - Reference coordinates at t=0 - Displacement fields: `displacement_t0.000`, `displacement_t0.005`, etc. -- Node thickness values +- Node thickness and other point data features This format is directly compatible with the VTP reader in this example. +#### Zarr Format + +Produces one Zarr store per run with pre-computed graph structure: + +``` +crash_processed_zarr/ +├── Run100.zarr/ +│ ├── mesh_pos # (timesteps, nodes, 3) - temporal positions +│ ├── thickness # (nodes,) - node features +│ └── edges # (num_edges, 2) - pre-computed graph connectivity +├── Run101.zarr/ +└── ... +``` + +Each Zarr store contains: +- `mesh_pos`: Full temporal trajectory (no displacement reconstruction needed) +- `thickness`: Per-node features +- `edges`: Pre-computed edge connectivity (no edge rebuilding during training) + +**NOTE:** All heavy preprocessing (node filtering, edge building, thickness computation) is done once during curation using PhysicsNeMo-Curator. The reader simply loads pre-computed arrays. + +This format is directly compatible with the Zarr reader in this example. + ## Training Training is managed via Hydra configurations located in conf/. @@ -277,14 +313,15 @@ If you use the graph datapipe, the edge list is produced by walking the filtered ### Built‑in VTP reader (PolyData) -In addition to `d3plot`, a lightweight VTP reader is provided in `vtp_reader.py`. It treats each `.vtp` file in a directory as a separate run and expects point displacements to be stored as vector arrays in `poly.point_data` with names like `displacement_t0.000`, `displacement_t0.005`, … (a more permissive fallback of any `displacement_t*` is also supported). The reader: +A lightweight VTP reader is provided in `vtp_reader.py`. It treats each `.vtp` file in a directory as a separate run and expects point displacements to be stored as vector arrays in `poly.point_data` with names like `displacement_t0.000`, `displacement_t0.005`, … (a more permissive fallback of any `displacement_t*` is also supported). The reader: - loads the reference coordinates from `poly.points` - builds absolute positions per timestep as `[t0: coords, t>0: coords + displacement_t]` - extracts cell connectivity from the PolyData faces and converts it to unique edges -- returns `(srcs, dsts, point_data)` where `point_data` contains `'coords': [T, N, 3]` +- extracts all point data fields dynamically (e.g., thickness, modulus) +- returns `(srcs, dsts, point_data)` where `point_data` contains `'coords': [T, N, 3]` and all feature arrays -By default, the VTP reader does not attach additional features; it is compatible with `features: []`. If your `.vtp` files include additional per‑point arrays you would like to model (e.g., thickness or modulus), extend the reader to add those arrays to each run’s record using keys that match your `features` list. 
The datapipe will then concatenate them in the configured order.
+The VTP reader dynamically extracts all non-displacement point data fields from the VTP file and makes them available to the datapipe. If your `.vtp` files include additional per‑point arrays (e.g., thickness or modulus), simply add their names to the `features` list in your datapipe config.

Example Hydra configuration for the VTP reader:

```yaml
# conf/reader/vtp.yaml
_target_: vtp_reader.Reader
```

Select it in `conf/config.yaml`:

```yaml
@@ -304,12 +341,58 @@ defaults:
   - reader: vtp
```

-And set `features` to empty (or to the names you add in your extended reader) in `conf/datapipe/point_cloud.yaml` or `conf/datapipe/graph.yaml`:
+And configure features in `conf/datapipe/point_cloud.yaml` or `conf/datapipe/graph.yaml`:

```yaml
-features: [] # or [thickness, Y_modulus] if your reader provides them
+features: [thickness] # or [] for no features
```

+### Built‑in Zarr reader
+
+A Zarr reader is provided in `zarr_reader.py`. It reads pre-processed Zarr stores created by PhysicsNeMo-Curator, where all heavy computation (node filtering, edge building, thickness computation) has already been done during the ETL pipeline. The reader:
+
+- loads pre-computed temporal positions directly from `mesh_pos` (no displacement reconstruction)
+- loads pre-computed edges (no connectivity-to-edge conversion needed)
+- dynamically extracts all point data fields (thickness, etc.) from the Zarr store
+- returns `(srcs, dsts, point_data)` in the same format as the VTP reader
+
+Data layout expected by the Zarr reader:
+- `/*.zarr/` (each `.zarr` directory is treated as one run)
+- Each Zarr store must contain:
+  - `mesh_pos`: `[T, N, 3]` temporal positions
+  - `edges`: `[E, 2]` pre-computed edge connectivity
+  - Feature arrays (e.g., `thickness`): `[N]` or `[N, K]` per-node features
+
+Example Hydra configuration for the Zarr reader:
+
+```yaml
+# conf/reader/zarr.yaml
+_target_: zarr_reader.Reader
+```
+
+Select it in `conf/config.yaml`:
+
+```yaml
+defaults:
+  - reader: zarr # Options are: vtp, d3plot, zarr
+  - datapipe: point_cloud # will be overridden by model configs
+  - model: transolver_autoregressive_rollout_training
+  - training: default
+  - inference: default
+  - _self_
+```
+
+And configure features in `conf/datapipe/graph.yaml`:
+
+```yaml
+features: [thickness] # Must match fields stored in Zarr
+```
+
+**Recommended workflow:**
+1. Use PhysicsNeMo-Curator to preprocess d3plot → VTP or Zarr once
+2. Use the corresponding reader for all training and validation
+3. Optionally use the d3plot reader for quick prototyping on raw data
+
 ### Data layout expected by readers
 
 - d3plot reader (`d3plot_reader.py`):
@@ -320,6 +403,10 @@ features: [] # or [thickness, Y_modulus] if your reader provides them
 - `/*.vtp` (each `.vtp` is treated as one run)
 - Displacements stored as 3‑component arrays in point_data with names like `displacement_t0.000`, `displacement_t0.005`, ... (fallback accepts any `displacement_t*`).
 
+- Zarr reader (`zarr_reader.py`):
+  - `/*.zarr/` (each `.zarr` directory is treated as one run)
+  - Contains pre-computed `mesh_pos`, `edges`, and feature arrays
+
 ### Write your own reader
 
 To write your own reader, implement a Hydra‑instantiable function or class whose call returns a three‑tuple `(srcs, dsts, point_data)`. The first two entries are lists of integer arrays describing edges per run (they can be empty lists if you are not producing a graph), and `point_data` is a list of Python dicts with one dict per run. Each dict must contain `'coords'` as a `[T, N, 3]` array and one array per feature name listed in `conf/datapipe/*.yaml` under `features`, as in the minimal sketch below.
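To make the contract concrete, here is a minimal sketch of such a reader. It assumes a purely hypothetical layout of one `.npz` file per run holding `coords` and `thickness` arrays; the file name `my_reader.py`, the `.npz` layout, and the feature names are illustrative, not part of this example:

```python
# my_reader.py -- minimal sketch of a custom reader (illustrative names).
# Hypothetical layout: one .npz file per run containing 'coords' [T, N, 3]
# and 'thickness' [N]; adapt the keys to your own data.
import glob
import os

import numpy as np


class Reader:
    """Hydra-instantiable reader returning (srcs, dsts, point_data)."""

    def __call__(self, data_dir: str, num_samples: int, split: str | None = None, **kwargs):
        srcs, dsts, point_data = [], [], []
        for path in sorted(glob.glob(os.path.join(data_dir, "*.npz")))[:num_samples]:
            run = np.load(path)
            # Empty edge arrays: acceptable for the point-cloud datapipe (no graph).
            srcs.append(np.empty(0, dtype=np.int64))
            dsts.append(np.empty(0, dtype=np.int64))
            point_data.append(
                {
                    "coords": run["coords"].astype(np.float64),  # [T, N, 3]
                    "thickness": run["thickness"].astype(np.float32),  # [N]
                }
            )
        return srcs, dsts, point_data
```

A matching `conf/reader/my_reader.yaml` with `_target_: my_reader.Reader`, mirroring `conf/reader/zarr.yaml` above, makes it selectable from `conf/config.yaml`.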
Feature arrays can be `[N]` or `[N, K]` and should use the same node indexing as `'coords'`. For convenience, a simple class reader can accept the Hydra `split` argument (e.g., "train" or "test") and decide whether to save VTP frames, but this is optional. From e31bf6c21b88324985514b6e6527175bf3248733 Mon Sep 17 00:00:00 2001 From: Sai Krishnan Chandrasekar <157182662+saikrishnanc-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 10:17:15 -0800 Subject: [PATCH 3/6] Update validation logic of point data in Zarr reader Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- .../structural_mechanics/crash/zarr_reader.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/structural_mechanics/crash/zarr_reader.py b/examples/structural_mechanics/crash/zarr_reader.py index 191494c2a5..c7cedceff0 100644 --- a/examples/structural_mechanics/crash/zarr_reader.py +++ b/examples/structural_mechanics/crash/zarr_reader.py @@ -144,9 +144,22 @@ def process_zarr_data( # Validate point data features for name, data in point_data_dict.items(): - if data.ndim != 1: + for name, data in point_data_dict.items(): + if data.ndim == 1: + if len(data) != num_nodes: + raise ValueError( + f"Point data '{name}' length {len(data)} doesn't match " + f"number of nodes {num_nodes} in {zarr_path}" + ) + elif data.ndim == 2: + if data.shape[0] != num_nodes: + raise ValueError( + f"Point data '{name}' shape {data.shape} doesn't match " + f"number of nodes {num_nodes} in {zarr_path}" + ) + else: raise ValueError( - f"Point data '{name}' must be [N], got {data.shape} in {zarr_path}" + f"Point data '{name}' must be [N] or [N,K], got shape {data.shape} in {zarr_path}" ) if len(data) != num_nodes: raise ValueError( From dede2483af130175288df046d3068560f0fdcc70 Mon Sep 17 00:00:00 2001 From: Sai Krishnan Chandrasekar <157182662+saikrishnanc-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 10:21:40 -0800 Subject: [PATCH 4/6] Update examples/structural_mechanics/crash/zarr_reader.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- examples/structural_mechanics/crash/zarr_reader.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/structural_mechanics/crash/zarr_reader.py b/examples/structural_mechanics/crash/zarr_reader.py index c7cedceff0..ba4ecd993d 100644 --- a/examples/structural_mechanics/crash/zarr_reader.py +++ b/examples/structural_mechanics/crash/zarr_reader.py @@ -143,7 +143,7 @@ def process_zarr_data( num_nodes = mesh_pos.shape[1] # Validate point data features - for name, data in point_data_dict.items(): + # Validate point data features for name, data in point_data_dict.items(): if data.ndim == 1: if len(data) != num_nodes: @@ -161,11 +161,6 @@ def process_zarr_data( raise ValueError( f"Point data '{name}' must be [N] or [N,K], got shape {data.shape} in {zarr_path}" ) - if len(data) != num_nodes: - raise ValueError( - f"Point data '{name}' length {len(data)} doesn't match " - f"number of nodes {num_nodes} in {zarr_path}" - ) # Validate edge indices are within bounds if edges.size > 0: From ec2293f4c1033efc5c1a3cbb5e60ee0d7992f7af Mon Sep 17 00:00:00 2001 From: Sai Krishnan Chandrasekar Date: Thu, 13 Nov 2025 10:39:26 -0800 Subject: [PATCH 5/6] Add a test for 2D feature arrays --- .../crash/tests/test_zarr_reader.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/examples/structural_mechanics/crash/tests/test_zarr_reader.py 
b/examples/structural_mechanics/crash/tests/test_zarr_reader.py index d79f173e77..8b5d0fe745 100644 --- a/examples/structural_mechanics/crash/tests/test_zarr_reader.py +++ b/examples/structural_mechanics/crash/tests/test_zarr_reader.py @@ -218,6 +218,46 @@ def test_load_zarr_store_multiple_point_data_fields(): assert data.dtype == np.float32, f"{name} should be float32" +def test_load_zarr_store_2d_feature_arrays(): + """Test that load_zarr_store correctly handles 2D feature arrays [N, K].""" + with tempfile.TemporaryDirectory() as temp_dir: + store_path = Path(temp_dir) / "2d_features.zarr" + store_path.mkdir() + + # Create store with 2D feature array + num_nodes = 8 + feature_dim = 3 + store = zarr.open(str(store_path), mode="w") + store.create_dataset( + "mesh_pos", data=np.random.randn(3, num_nodes, 3).astype(np.float32) + ) + store.create_dataset("edges", data=np.array([[0, 1]], dtype=np.int64)) + # Add 1D feature (thickness) + store.create_dataset("thickness", data=np.ones(num_nodes, dtype=np.float32)) + # Add 2D feature array [N, K] (e.g., stress tensor components) + stress_tensor = np.random.randn(num_nodes, feature_dim).astype(np.float32) + store.create_dataset("stress_tensor", data=stress_tensor) + + mesh_pos, edges, point_data_dict = zarr_reader.load_zarr_store(str(store_path)) + + # Should have both 1D and 2D features + assert "thickness" in point_data_dict + assert "stress_tensor" in point_data_dict + + # Check 1D feature shape + assert point_data_dict["thickness"].shape == (num_nodes,) + assert point_data_dict["thickness"].ndim == 1 + + # Check 2D feature shape + assert point_data_dict["stress_tensor"].shape == (num_nodes, feature_dim) + assert point_data_dict["stress_tensor"].ndim == 2 + + # Verify values match + np.testing.assert_array_almost_equal( + point_data_dict["stress_tensor"], stress_tensor + ) + + def test_process_zarr_data(mock_zarr_directory): """Test processing multiple Zarr stores.""" srcs, dsts, point_data = zarr_reader.process_zarr_data( From c17619073e3ccb3580427b338a51376da3d7157b Mon Sep 17 00:00:00 2001 From: Sai Krishnan Chandrasekar <157182662+saikrishnanc-nv@users.noreply.github.com> Date: Thu, 13 Nov 2025 10:44:37 -0800 Subject: [PATCH 6/6] Update examples/structural_mechanics/crash/zarr_reader.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- examples/structural_mechanics/crash/zarr_reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/structural_mechanics/crash/zarr_reader.py b/examples/structural_mechanics/crash/zarr_reader.py index ba4ecd993d..150ff957a1 100644 --- a/examples/structural_mechanics/crash/zarr_reader.py +++ b/examples/structural_mechanics/crash/zarr_reader.py @@ -142,7 +142,6 @@ def process_zarr_data( num_nodes = mesh_pos.shape[1] - # Validate point data features # Validate point data features for name, data in point_data_dict.items(): if data.ndim == 1:
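
As a quick end-to-end check of the reader added in this series, here is a minimal standalone sketch; the tiny store mirrors the test fixtures above, and the array sizes are arbitrary. Run it from `examples/structural_mechanics/crash/` so that `zarr_reader` is importable:

```python
# smoke_test.py -- minimal sketch exercising the Zarr Reader from this series.
# The store layout mirrors the test helper above; sizes are arbitrary.
import tempfile
from pathlib import Path

import numpy as np
import zarr

import zarr_reader

with tempfile.TemporaryDirectory() as tmp:
    # Build one tiny run: 3 timesteps, 4 nodes, 2 edges.
    store = zarr.open(str(Path(tmp) / "Run000.zarr"), mode="w")
    store.create_dataset("mesh_pos", data=np.random.randn(3, 4, 3).astype(np.float32))
    store.create_dataset("thickness", data=np.ones(4, dtype=np.float32))
    store.create_dataset("edges", data=np.array([[0, 1], [1, 2]], dtype=np.int64))

    srcs, dsts, point_data = zarr_reader.Reader()(data_dir=tmp, num_samples=1, split="train")
    print(point_data[0]["coords"].shape)  # (3, 4, 3)
    print(srcs[0], dsts[0])               # [0 1] [1 2]
```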