From 58bfcc2ddbac9b887d59245b45009d6e29e31f07 Mon Sep 17 00:00:00 2001 From: Rishi Ranade Date: Wed, 19 Nov 2025 07:36:38 -0800 Subject: [PATCH] adding ball query based local features --- .../transolver/requirements.txt | 4 + .../transolver/src/compute_normalizations.py | 2 +- .../transolver/src/conf/train_surfaceX.yaml | 15 +- .../transolver/src/conf/train_volumeX.yaml | 16 +- .../transolver/src/inference_on_zarr.py | 17 +- .../src/surface_fields_normalization.npz | Bin 1040 -> 1040 bytes .../transolver/src/train.py | 171 +++++++++++------- .../src/volume_fields_normalization.npz | Bin 0 -> 1056 bytes physicsnemo/models/transolver/transolverX.py | 134 +++++++++++++- 9 files changed, 276 insertions(+), 83 deletions(-) create mode 100644 examples/cfd/external_aerodynamics/transolver/src/volume_fields_normalization.npz diff --git a/examples/cfd/external_aerodynamics/transolver/requirements.txt b/examples/cfd/external_aerodynamics/transolver/requirements.txt index ffc351ec7b..61fce05183 100644 --- a/examples/cfd/external_aerodynamics/transolver/requirements.txt +++ b/examples/cfd/external_aerodynamics/transolver/requirements.txt @@ -7,3 +7,7 @@ einops transformer_engine[pytorch] tensorstore zarr>=3.0 +s3fs +treelib +warp-lang +timm \ No newline at end of file diff --git a/examples/cfd/external_aerodynamics/transolver/src/compute_normalizations.py b/examples/cfd/external_aerodynamics/transolver/src/compute_normalizations.py index 749a7ab2f7..a46f2c9813 100644 --- a/examples/cfd/external_aerodynamics/transolver/src/compute_normalizations.py +++ b/examples/cfd/external_aerodynamics/transolver/src/compute_normalizations.py @@ -103,7 +103,7 @@ def compute_mean_std_min_max( return mean, std, min_val, max_val -@hydra.main(version_base="1.3", config_path="conf", config_name="train_surface") +@hydra.main(version_base="1.3", config_path="conf", config_name="train_surfaceX") def main(cfg: DictConfig) -> None: """ Script entry point for computing normalization statistics for a specified field diff --git a/examples/cfd/external_aerodynamics/transolver/src/conf/train_surfaceX.yaml b/examples/cfd/external_aerodynamics/transolver/src/conf/train_surfaceX.yaml index 4fdcbb8219..7939e4dbf8 100644 --- a/examples/cfd/external_aerodynamics/transolver/src/conf/train_surfaceX.yaml +++ b/examples/cfd/external_aerodynamics/transolver/src/conf/train_surfaceX.yaml @@ -18,12 +18,12 @@ output_dir: "runs" checkpoint_dir: null # Optional: set custom checkpoint path, defaults to output_dir -run_id: "surfaceX/bfloat16" +run_id: "surfaceX/runs1" profile: false # Training configuration training: - precision: bfloat16 # float32, float16, bfloat16, or float8 + precision: float32 # float32, float16, bfloat16, or float8 num_epochs: 501 # Add one to save at 250 save_interval: 25 # Save checkpoint every N epochs compile: false @@ -44,7 +44,10 @@ model: slice_num: 512 # Number of slices in physics attention use_te: false # Use transformer engine plus: false - + include_local_features: true # use local features + radii: [0.05, 0.25] # radius for local features + neighbors_in_radius: [8, 32] # neighbors in radius for local features + n_hidden_local: 512 # hidden dimension for local features # StepLR scheduler: Decays the learning rate by gamma every step_size epochs scheduler: @@ -65,14 +68,14 @@ optimizer: # Data configuration data: train: - data_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/drivaer_aws/domino/train/ + data_path: 
/lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/drivaer_aws/domino/train/ val: - data_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/drivaer_aws/domino/val/ + data_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/drivaer_aws/domino/val/ max_workers: 8 normalization_dir: "src/" # Directory for normalization files preload_depth: 1 pin_memory: true - resolution: 300_000 + resolution: 80_000 mode: surface # Preprocessing switches: # (Changing thes will change the embedding dim) diff --git a/examples/cfd/external_aerodynamics/transolver/src/conf/train_volumeX.yaml b/examples/cfd/external_aerodynamics/transolver/src/conf/train_volumeX.yaml index 97388171b8..d9c832765c 100644 --- a/examples/cfd/external_aerodynamics/transolver/src/conf/train_volumeX.yaml +++ b/examples/cfd/external_aerodynamics/transolver/src/conf/train_volumeX.yaml @@ -18,14 +18,14 @@ output_dir: "runs" checkpoint_dir: null # Optional: set custom checkpoint path, defaults to output_dir -run_id: "volumeX/fake-name" +run_id: "volumeX/runs1" profile: false # Training configuration training: - precision: bfloat16 # float32, float16, bfloat16, or float8 - num_epochs: 500 # Add one to save at 250 - save_interval: 10 # Save checkpoint every N epochs + precision: float32 # float32, float16, bfloat16, or float8 + num_epochs: 501 # Add one to save at 250 + save_interval: 25 # Save checkpoint every N epochs compile: false # Model configuration @@ -44,6 +44,10 @@ model: slice_num: 256 # Number of slices in physics attention use_te: false # Use transformer engine plus: false + include_local_features: true # use local features + radii: [0.05, 0.25] # radius for local features + neighbors_in_radius: [8, 32] # neighbors in radius for local features + n_hidden_local: 512 # hidden dimension for local features # scheduler: # name: "OneCycleLR" @@ -70,9 +74,9 @@ optimizer: # Data configuration data: train: - data_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/drivaer_aws/domino/train/ + data_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/drivaer_aws/domino/train/ val: - data_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/drivaer_aws/domino/val/ + data_path: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/drivaer_aws/domino/val/ max_workers: 8 normalization_dir: "src/" # Directory for normalization files preload_depth: 1 diff --git a/examples/cfd/external_aerodynamics/transolver/src/inference_on_zarr.py b/examples/cfd/external_aerodynamics/transolver/src/inference_on_zarr.py index bc3115cb37..1659fdef2c 100644 --- a/examples/cfd/external_aerodynamics/transolver/src/inference_on_zarr.py +++ b/examples/cfd/external_aerodynamics/transolver/src/inference_on_zarr.py @@ -40,7 +40,7 @@ create_transolver_dataset, TransolverDataPipe, ) -from train import forward_pass +from train import forward_passX from tabulate import tabulate # import transformer_engine.pytorch as te @@ -162,7 +162,7 @@ def batched_inference_loop( local_batch["geometry"] = batch["geometry"] # Run the forward inference pass: - local_loss, local_metrics, local_preds_targets = forward_pass( + local_loss, local_metrics, local_preds_targets = forward_passX( local_batch, model, precision, @@ -309,7 +309,7 @@ def inference(cfg: DictConfig) -> None: pred_pressure = pred_pressure.reshape(-1) pred_drag_coeff, _, _ = compute_force_coefficients( batch["surface_normals"][0], - batch["surface_areas"], + batch["surface_areas"][0], coeff, pred_pressure, pred_shear, @@ 
-318,7 +318,7 @@ def inference(cfg: DictConfig) -> None: pred_lift_coeff, _, _ = compute_force_coefficients( batch["surface_normals"][0], - batch["surface_areas"], + batch["surface_areas"][0], coeff, pred_pressure, pred_shear, @@ -333,7 +333,7 @@ def inference(cfg: DictConfig) -> None: true_pressure = true_pressure.reshape(-1) true_drag_coeff, _, _ = compute_force_coefficients( batch["surface_normals"][0], - batch["surface_areas"], + batch["surface_areas"][0], coeff, true_pressure, true_shear, @@ -342,7 +342,7 @@ def inference(cfg: DictConfig) -> None: true_lift_coeff, _, _ = compute_force_coefficients( batch["surface_normals"][0], - batch["surface_areas"], + batch["surface_areas"][0], coeff, true_pressure, true_shear, @@ -352,6 +352,9 @@ def inference(cfg: DictConfig) -> None: pred_lift_coeff = pred_lift_coeff.item() pred_drag_coeff = pred_drag_coeff.item() + true_lift_coeff = true_lift_coeff.item() + true_drag_coeff = true_drag_coeff.item() + # Extract metric values and convert tensors to floats l2_pressure = ( metrics["l2_pressure_surf"].item() @@ -431,7 +434,7 @@ def inference(cfg: DictConfig) -> None: ) -@hydra.main(version_base=None, config_path="conf", config_name="train_surface") +@hydra.main(version_base=None, config_path="conf", config_name="train_surfaceX") def launch(cfg: DictConfig) -> None: """ Launch inference with Hydra configuration. diff --git a/examples/cfd/external_aerodynamics/transolver/src/surface_fields_normalization.npz b/examples/cfd/external_aerodynamics/transolver/src/surface_fields_normalization.npz index b6809d416c9ba267131897355c52b9c8a5e3dee3..228f7550cc4be6993352c81bb56def3b95cd036d 100644 GIT binary patch delta 218 zcmbQhF@ZxUz?+#xmjMD485T-r+@C14Lg^5n_dX`0j$L{&D@^si)a^i3wVM0D%87TD zD@ndIwTrjiWOX7c!^+6f-U?OKzxi`_PhQHnT&cg-^pK3pL8brz delta 218 zcmbQhF@ZxUz?+#xmjMD4878II-J2-1LTSBx&_21xJ-dWsSDW(cwC_MwmGM~3cH*7o zO2L1P?HK(wT1}ggZdJ3;&I(o4%YT6rCNE`NuGBu~>>(MK$$RJTQrb7O#cCg_vfTyG zq9*4tEmyG$+~ct8^;z2_>2f=7@m9M4Z$>6vW>mLLwq!P90a-sepV^QFY(qY?*yLTz N(kx(eK%Do?iU8M}L=gZ0 diff --git a/examples/cfd/external_aerodynamics/transolver/src/train.py b/examples/cfd/external_aerodynamics/transolver/src/train.py index 15e7be4d19..de32f34b0d 100644 --- a/examples/cfd/external_aerodynamics/transolver/src/train.py +++ b/examples/cfd/external_aerodynamics/transolver/src/train.py @@ -17,7 +17,6 @@ import os import time from pathlib import Path -from typing import Literal import torch import hydra @@ -37,7 +36,6 @@ from physicsnemo.datapipes.cae.transolver_datapipe import ( create_transolver_dataset, - TransolverDataPipe, ) from loss import loss_fn from metrics import metrics_fn @@ -54,6 +52,19 @@ from torch.optim import Optimizer from typing import Any, Callable, Sequence +import typing +import collections +from typing import Literal + +torch.serialization.add_safe_globals([omegaconf.listconfig.ListConfig]) +torch.serialization.add_safe_globals([omegaconf.base.ContainerMetadata]) +torch.serialization.add_safe_globals([typing.Any]) +torch.serialization.add_safe_globals([list]) +torch.serialization.add_safe_globals([collections.defaultdict]) +torch.serialization.add_safe_globals([dict]) +torch.serialization.add_safe_globals([int]) +torch.serialization.add_safe_globals([omegaconf.nodes.AnyNode]) +torch.serialization.add_safe_globals([omegaconf.base.Metadata]) class CombinedOptimizer(Optimizer): @@ -153,11 +164,7 @@ def cast_precisions(*tensors: torch.Tensor, precision: str) -> list[torch.Tensor return tensors -def pad_input_for_fp8( - features: torch.Tensor, - embeddings: torch.Tensor, 
- geometry: torch.Tensor | None = None, -) -> torch.Tensor: +def pad_input_for_fp8(features: torch.Tensor, embeddings: torch.Tensor) -> torch.Tensor: """ Pads the input features tensor so that the concatenated feature and embedding dimension is a multiple of 16, which is required for FP8 operations. Only the features is updated. @@ -175,14 +182,7 @@ def pad_input_for_fp8( features = torch.nn.functional.pad(features, (0, pad_size)) fx_dim = features.shape[-1] + embeddings.shape[-1] - if geometry is not None: - geometry_dim = geometry.shape[-1] if geometry is not None else 0 - if geometry_dim % 16 != 0: - pad_size = 16 - (geometry_dim % 16) - geometry = torch.nn.functional.pad(geometry, (0, pad_size)) - geometry_dim = geometry.shape[-1] - - return features, geometry + return features def unpad_output_for_fp8( @@ -210,8 +210,8 @@ def forward_pass( precision: str, output_pad_size: int | None, dist_manager: DistributedManager, - data_mode: Literal["surface", "volume"], - datapipe: TransolverDataPipe, + cfg: DictConfig, + norm_factors: dict[str, torch.Tensor], ): """ Run the forward pass of the model for one batch, including metrics and loss calculation. @@ -223,43 +223,62 @@ def forward_pass( # Cast precisions: features, embeddings = cast_precisions(features, embeddings, precision=precision) - - if "geometry" in batch.keys(): - (geometry,) = cast_precisions(batch["geometry"], precision=precision) - else: - geometry = None - with get_autocast_context(precision): # For fp8, we may have to pad the inputs: if precision == "float8": - features, geometry = pad_input_for_fp8(features, embeddings, geometry) + features = pad_input_for_fp8(features, embeddings) - if "geometry" in batch.keys(): - outputs = model( - global_embedding=features, local_embedding=embeddings, geometry=geometry - ) - else: - outputs = model(fx=features, embedding=embeddings) + outputs = model(features, embeddings) outputs = unpad_output_for_fp8(outputs, output_pad_size) - loss = loss_fn(outputs, targets, data_mode) + loss = loss_fn(outputs, targets, cfg.data.mode) - air_density = batch["air_density"] if "air_density" in batch.keys() else None - stream_velocity = ( - batch["stream_velocity"] if "stream_velocity" in batch.keys() else None - ) + metrics = metrics_fn(outputs, targets, dist_manager, cfg.data.mode) - unscaled_outputs = datapipe.unscale_model_targets( - outputs, air_density=air_density, stream_velocity=stream_velocity - ) - unscaled_targets = datapipe.unscale_model_targets( - targets, air_density=air_density, stream_velocity=stream_velocity + return loss, metrics + + +def forward_passX( + batch: dict, + model: torch.nn.Module, + precision: str, + output_pad_size: int | None, + dist_manager: DistributedManager, + data_mode: Literal["surface", "volume"], + norm_factors: dict[str, torch.Tensor], +): + """ + Run the forward pass of the model for one batch, including metrics and loss calculation. 
+ """ + + features = batch["fx"] + embeddings = batch["embeddings"] + targets = batch["fields"] + geometry = batch["geometry"] + + # Cast precisions: + features, embeddings, geometry = cast_precisions( + features, embeddings, geometry, precision=precision ) + with get_autocast_context(precision): + # For fp8, we may have to pad the inputs: + # if precision == "float8": + # features = pad_input_for_fp8(features, embeddings) - metrics = metrics_fn(unscaled_outputs, unscaled_targets, dist_manager, data_mode) + outputs = model( + local_embedding=embeddings, + global_embedding=features, + geometry=geometry, + ) - return loss, metrics, (unscaled_outputs, unscaled_targets) + outputs = unpad_output_for_fp8(outputs, output_pad_size) + + loss = loss_fn(outputs, targets, data_mode) + + metrics = metrics_fn(outputs, targets, dist_manager, data_mode) + + return loss, metrics, [outputs, targets] @profile @@ -275,6 +294,7 @@ def train_epoch( epoch: int, cfg: DictConfig, dist_manager: DistributedManager, + norm_factors: dict[str, torch.Tensor], scaler: GradScaler | None = None, ) -> float: """ @@ -292,6 +312,7 @@ def train_epoch( epoch (int): Current epoch number. cfg (DictConfig): Hydra configuration object. dist_manager (DistributedManager): Distributed manager from physicsnemo. + norm_factors (dict[str, torch.Tensor]): Normalization factors for the data. scaler (GradScaler | None, optional): Gradient scaler for mixed precision training. Returns: float: The average training loss for the epoch. @@ -305,16 +326,26 @@ def train_epoch( for i, batch in enumerate(dataloader): # TransolverX has a different forward pass: - - loss, metrics, _ = forward_pass( - batch, - model, - precision, - output_pad_size, - dist_manager, - cfg.data.mode, - dataloader, - ) + if "geometry" in batch.keys(): + loss, metrics, local_preds_targets = forward_passX( + batch, + model, + precision, + output_pad_size, + dist_manager, + cfg.data.mode, + norm_factors, + ) + else: + loss, metrics = forward_pass( + batch, + model, + precision, + output_pad_size, + dist_manager, + cfg, + norm_factors, + ) optimizer.zero_grad() if precision == "float16" and scaler is not None: @@ -395,6 +426,7 @@ def val_epoch( epoch: int, cfg: DictConfig, dist_manager: DistributedManager, + norm_factors: dict[str, torch.Tensor], ) -> float: """ Run validation for one epoch. @@ -409,6 +441,7 @@ def val_epoch( epoch (int): Current epoch number. cfg (DictConfig): Hydra configuration object. dist_manager (DistributedManager): Distributed manager instance. + norm_factors (dict[str, torch.Tensor]): Normalization factors for the data. Returns: float: The average validation loss for the epoch. 
""" @@ -422,15 +455,27 @@ def val_epoch( start_time = time.time() with torch.no_grad(): # Disable gradient computation for i, batch in enumerate(dataloader): - loss, metrics, _ = forward_pass( - batch, - model, - precision, - output_pad_size, - dist_manager, - cfg.data.mode, - dataloader, - ) + # TransolverX has a different forward pass: + if "geometry" in batch.keys(): + loss, metrics, local_preds_targets = forward_passX( + batch, + model, + precision, + output_pad_size, + dist_manager, + cfg.data.mode, + norm_factors, + ) + else: + loss, metrics = forward_pass( + batch, + model, + precision, + output_pad_size, + dist_manager, + cfg, + norm_factors, + ) if i == 0: total_metrics = metrics @@ -700,6 +745,7 @@ def main(cfg: DictConfig): epoch, cfg, dist_manager, + norm_factors, scaler, ) end_time = time.time() @@ -717,6 +763,7 @@ def main(cfg: DictConfig): epoch, cfg, dist_manager, + norm_factors, ) end_time = time.time() val_duration = end_time - start_time @@ -736,7 +783,7 @@ def main(cfg: DictConfig): logger.info("Training completed!") -@hydra.main(version_base=None, config_path="conf", config_name="train_surface") +@hydra.main(version_base=None, config_path="conf", config_name="train_volumeX") def launch(cfg: DictConfig): """Launch training with hydra configuration diff --git a/examples/cfd/external_aerodynamics/transolver/src/volume_fields_normalization.npz b/examples/cfd/external_aerodynamics/transolver/src/volume_fields_normalization.npz new file mode 100644 index 0000000000000000000000000000000000000000..c1f0e6f463f1a4efe45af83a2eca8e0ff0c53802 GIT binary patch literal 1056 zcmWIWW@gc4fB;2?4`=kB|Azt&1`&qb)Wkf!yn;$b1_6dCP*pH`vR|lgKqMnW8AG*t zN@{U(k-C+Fx=osix{iW+T7FSUQDR64Li^`#U&|&)iWmrPT4i?^s-Z` zkhXgjsACt;HWSS=ryjhtM)pi@=Aez4FWg)9UzQO+uuh8c0NW<3gZ%HbW}wDQ8znn1|_yQbMd%b torch.Tensor: return slice_tokens +class GeoConvOut(nn.Module): + """ + Geometry layer to project STL geometry data onto regular grids. + """ + + def __init__( + self, + input_features: int, + neighbors_in_radius: int, + base_neurons: int, + fourier_features: bool, + num_modes: int, + ): + """ + Initialize the GeoConvOut layer. + + Args: + input_features: Number of input feature dimensions + neighbors_in_radius: Number of neighbors in radius + """ + super().__init__() + self.base_neurons = base_neurons + self.fourier_features = fourier_features + self.num_modes = num_modes + + if self.fourier_features: + input_features_calculated = ( + input_features * (1 + 2 * self.num_modes) * neighbors_in_radius + ) + else: + input_features_calculated = input_features * neighbors_in_radius + + self.mlp = Mlp( + in_features=input_features_calculated, + hidden_features=[base_neurons, base_neurons // 2], + out_features=base_neurons, + act_layer=nn.GELU, + drop=0.0, + ) + + self.activation = nn.GELU + + self.neighbors_in_radius = neighbors_in_radius + + if self.fourier_features: + self.register_buffer( + "freqs", torch.exp(torch.linspace(0, math.pi, self.num_modes)) + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + """ + Process and project geometric features onto a 3D grid. 
+
+        Args:
+            x: Input tensor containing coordinates of the neighboring points
+                (batch_size, n_points, n_neighbors, 3)
+
+        Returns:
+            Processed geometry features of shape (batch_size, n_points, base_neurons)
+        """
+
+        b, n_points, n_neighbors, c = x.shape
+        x = rearrange(
+            x, "b x y z -> b x (y z)", x=n_points, y=n_neighbors, z=c
+        )
+        if self.fourier_features:
+            facets = torch.cat((x, fourier_encode(x, self.freqs)), dim=-1)
+        else:
+            facets = x
+
+        x = torch.tanh(self.mlp(facets))
+
+        return x
+
+
 class TransolverX(Module):
     """
     Transolver model, adapted from original transolver code.
@@ -512,6 +591,14 @@ class TransolverX(Module):
         Whether to include time embeddings. Default is false
     plus: bool
         Use Transolver++ implementation in the Physics Attention layers.
+    include_local_features: bool
+        Whether to compute ball-query based local geometry features. Default is false
+    radii: list[float]
+        Ball-query radii for the local features. Default is [0.05, 0.25]
+    neighbors_in_radius: list[int]
+        Maximum number of neighbors gathered within each radius. Default is [8, 32]
+    n_hidden_local: int
+        Hidden dimension of the local feature branches. Default is 512
 
     """
 
@@ -531,10 +618,16 @@ def __init__(
         use_te: bool = True,
         time_input: bool = False,
         plus: bool = False,
+        include_local_features: bool = False,
+        radii: list[float] = [0.05, 0.25],
+        neighbors_in_radius: list[int] = [8, 32],
+        n_hidden_local: int = 512,
     ) -> None:
         super().__init__(meta=MetaData())
         self.__name__ = "Transolver"
 
+        self.include_local_features = include_local_features
+
         self.use_te = use_te
 
         # Check that the hidden dimension and head dimensions are compatible:
         if not n_hidden % n_head == 0:
@@ -544,7 +637,40 @@ def __init__(
 
         # These are to project geometry embeddings and global embeddings onto
        # a physical state space:
+        context_dim = 0
+        if geometry_dim is not None and self.include_local_features:
+            self.radii = radii
+            self.neighbors_in_radius = neighbors_in_radius
+            self.bq_warp = nn.ModuleList()
+            self.geo_conv_out = nn.ModuleList()
+            self.geometry_features_tokenizer = nn.ModuleList()
+
+            for h in range(len(self.radii)):
+                self.bq_warp.append(BQWarp(
+                    radius=radii[h],
+                    neighbors_in_radius=neighbors_in_radius[h],
+                ))
+
+                self.geo_conv_out.append(GeoConvOut(
+                    input_features=geometry_dim,
+                    neighbors_in_radius=neighbors_in_radius[h],
+                    base_neurons=n_hidden_local,
+                    fourier_features=False,
+                    num_modes=1,
+                ))
+
+                self.geometry_features_tokenizer.append(ContextProjector(
+                    n_hidden_local,
+                    n_head,
+                    n_hidden // n_head,
+                    dropout,
+                    slice_num,
+                    use_te,
+                    plus,
+                ))
+                context_dim += n_hidden // n_head
+
         if geometry_dim is not None:
             self.geometry_tokenizer = ContextProjector(
                 geometry_dim,
@@ -644,6 +770,12 @@ def forward(
 
         global_context_input = []
         if geometry is not None:
+            if self.include_local_features:
+                for h in range(len(self.radii)):
+                    mapping, k_short = self.bq_warp[h](geometry, geometry)
+                    geometry_features = self.geo_conv_out[h](k_short)
+                    geometry_states = self.geometry_features_tokenizer[h](geometry_features)
+                    global_context_input.append(geometry_states)
             geometry_states = self.geometry_tokenizer(geometry)
             global_context_input.append(geometry_states)
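Notes on the ball-query local features introduced by this patch:

The core primitive is the ball query: for every geometry point, gather up to `k` neighboring points that lie within a given radius, once per configured scale (`radii: [0.05, 0.25]` paired with `neighbors_in_radius: [8, 32]`). The patch performs this with PhysicsNeMo's Warp-accelerated `BQWarp` layer; the snippet below is a minimal torch-only sketch of the same idea, with `ball_query` as a hypothetical stand-in rather than the actual `BQWarp` API.

```python
import torch


def ball_query(
    points: torch.Tensor, queries: torch.Tensor, radius: float, k: int
) -> torch.Tensor:
    """Gather up to k neighbor coordinates within `radius` of each query.

    points:  (B, N, 3) candidate coordinates
    queries: (B, M, 3) query coordinates
    returns: (B, M, k, 3) neighbor coordinates, zero-filled where fewer
             than k points fall inside the radius
    """
    # Pairwise distances between queries and points: (B, M, N)
    dists = torch.cdist(queries, points)
    # Exclude points beyond the radius before taking the k closest.
    dists = dists.masked_fill(dists > radius, float("inf"))
    knn_dists, knn_idx = dists.topk(k, dim=-1, largest=False)
    # Gather neighbor coordinates: (B, M, k, 3)
    idx = knn_idx.unsqueeze(-1).expand(-1, -1, -1, points.shape[-1])
    neighbors = torch.gather(
        points.unsqueeze(1).expand(-1, queries.shape[1], -1, -1), 2, idx
    )
    # Zero out slots whose distance stayed infinite (no neighbor in radius).
    return neighbors.masked_fill(knn_dists.isinf().unsqueeze(-1), 0.0)


# Mirrors the two scales configured in train_surfaceX.yaml / train_volumeX.yaml:
geometry = torch.rand(1, 1024, 3)
for radius, k in zip([0.05, 0.25], [8, 32]):
    k_short = ball_query(geometry, geometry, radius, k)
    print(k_short.shape)  # torch.Size([1, 1024, 8, 3]), then ([1, 1024, 32, 3])
```

Querying the geometry against itself, as `self.bq_warp[h](geometry, geometry)` does in `TransolverX.forward`, gives each point a description of its own neighborhood at two length scales.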
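When `fourier_features` is enabled, `GeoConvOut` sizes its MLP input as `input_features * (1 + 2 * num_modes) * neighbors_in_radius`: the flattened neighbor coordinates plus one sine and one cosine channel per frequency mode. The patch does not include the body of `fourier_encode` (it is imported from elsewhere in PhysicsNeMo), so the sketch below is an assumed implementation that only reproduces this dimension arithmetic.

```python
import math

import torch


def fourier_encode(x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """Assumed sketch: concatenate sin/cos encodings of x at each frequency.

    x: (..., C), freqs: (num_modes,) -> (..., C * 2 * num_modes)
    """
    scaled = x.unsqueeze(-1) * freqs  # (..., C, num_modes)
    enc = torch.cat([scaled.sin(), scaled.cos()], dim=-1)
    return enc.flatten(-2)


# As in GeoConvOut: num_modes = 1, frequencies log-spaced on [exp(0), exp(pi)]
freqs = torch.exp(torch.linspace(0, math.pi, 1))
x = torch.rand(1, 1024, 8 * 3)  # 8 neighbors x 3 coords, already flattened
facets = torch.cat((x, fourier_encode(x, freqs)), dim=-1)
print(facets.shape)  # (1, 1024, 72) == 24 * (1 + 2 * 1)
```

Note that the patch instantiates `GeoConvOut(..., fourier_features=False, num_modes=1)`, so this path is disabled by default and the MLP sees only the raw neighbor coordinates.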
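Putting the pieces together, each `radii[h]` / `neighbors_in_radius[h]` pair defines an independent branch: ball query, flatten each neighborhood into the channel axis, MLP, tanh (`BQWarp` + `GeoConvOut`), after which a `ContextProjector` tokenizes the per-point features and the result is appended to `global_context_input`. The sketch below compresses one branch into a single module, reusing the `ball_query` helper from the first sketch and substituting a plain `nn.Sequential` for timm's `Mlp` — a hypothetical simplification, not the patch's exact modules.

```python
import torch
import torch.nn as nn


class LocalGeometryFeatures(nn.Module):
    """One local-feature branch: ball query -> flatten -> MLP -> tanh."""

    def __init__(self, radius: float, k: int, base_neurons: int = 512):
        super().__init__()
        self.radius, self.k = radius, k
        # Mirrors Mlp(hidden_features=[base_neurons, base_neurons // 2],
        # out_features=base_neurons) inside GeoConvOut.
        self.mlp = nn.Sequential(
            nn.Linear(3 * k, base_neurons),
            nn.GELU(),
            nn.Linear(base_neurons, base_neurons // 2),
            nn.GELU(),
            nn.Linear(base_neurons // 2, base_neurons),
        )

    def forward(self, geometry: torch.Tensor) -> torch.Tensor:
        # (B, N, k, 3) neighbor coordinates around each point
        k_short = ball_query(geometry, geometry, self.radius, self.k)
        # Fold neighbors into channels, as GeoConvOut's rearrange does
        flat = k_short.flatten(-2)  # (B, N, 3 * k)
        return torch.tanh(self.mlp(flat))  # (B, N, base_neurons)


branches = nn.ModuleList(
    LocalGeometryFeatures(r, k) for r, k in zip([0.05, 0.25], [8, 32])
)
geometry = torch.rand(1, 1024, 3)
states = [branch(geometry) for branch in branches]  # two (1, 1024, 512) tensors
```

In `TransolverX` each branch output then passes through its own `geometry_features_tokenizer[h]` before joining the global geometry tokens, so the attention layers see both multi-scale local context and the full-shape embedding.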