95 changes: 95 additions & 0 deletions docs/development/dace_codegen_reproducability.md
@@ -0,0 +1,95 @@
# Debugging nondeterministic behavior of DaCe transformations

- Enable printing each transformation step, e.g. using
  ```
  dace.Config.set("progress", value=True)
  ```

  **Contributor:** This will only cover roughly 95% of the cases, as transformations can also run through means other than the pattern matcher, or can simply be plain functions that do things.

  **Contributor:** To avoid editing the code to add this line, you can instead export an environment variable:

  ```
  export DACE_progress=1
  ```

  (note that upper/lower case unfortunately matters)

  TODO: introduce a new config var that prints the hash instead of hard-coding it.
- Execute the program in question twice and compare the output.
- Set a conditional breakpoint at the beginning of the `apply` method of the first pass where the SDFG
  hash changes, with condition `sdfg.hash_sdfg() == <last equal hash>`.

  **Contributor:** `apply()` is specific to `PatternTransformation`. Not all transformations use it (but the majority do). The most general interface is given by `Pass`, which defines the `apply_pass()` method. But as mentioned above, plain functions can also be transformations.

  Note: In case running the previous passes takes a long time, it makes sense to serialize the SDFG
  to JSON (`sdfg.to_json("sdfg(1|2).json")`) and load it again (see the debug scripts below) to
  ease debugging. In rare cases serializing and deserializing the SDFG changes the hash. In such
  cases this trick doesn't work, and the first location where the hash changes might not be the exact
  location of the nondeterministic behavior. It can help to use a different hash, e.g.
  `content_hash`, but this should be solved in general.
  Note: It also makes sense to place a breakpoint after `DaceTranslator.generate_sdfg` to recognize
  when all executions have finished.
- When the location is found, it is usually easy to spot the origin of the nondeterminism. Often
  there is a set operation, or a symbol is named in a nondeterministic way. Use ordered sets and
  deterministic symbol names.
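The bisection idea behind the steps above can be sketched without DaCe. In this hypothetical stand-in, `passes` is a list of transformation functions on a plain dict and `stable_hash` replaces `sdfg.hash_sdfg()`; the helper runs the pipeline twice and reports the first pass after which the two runs diverge:

```python
import hashlib
import json
import random


def stable_hash(obj) -> str:
    # Deterministic hash of a JSON-serializable object
    # (stand-in for sdfg.hash_sdfg()).
    return hashlib.sha256(json.dumps(obj, sort_keys=True).encode()).hexdigest()


def first_diverging_pass(passes, make_input):
    """Run the pipeline twice; return the index of the first pass after
    which the two runs produce different hashes, or None if they agree."""
    runs = []
    for _ in range(2):
        state = make_input()
        hashes = []
        for p in passes:
            state = p(state)
            hashes.append(stable_hash(state))
        runs.append(hashes)
    for i, (h1, h2) in enumerate(zip(*runs)):
        if h1 != h2:
            return i
    return None


# Example: the second pass is nondeterministic (it invents a random name).
passes = [
    lambda s: {**s, "a": s["a"] + 1},
    lambda s: {**s, "tmp": f"tmp_{random.randrange(10**9)}"},
    lambda s: {**s, "b": 0},
]
print(first_diverging_pass(passes, lambda: {"a": 0}))  # → 1
```

With real SDFGs the same pattern applies: hash after every pass, compare two executions, and set the conditional breakpoint at the first pass whose output hash differs.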

## Appendix

__Debugging SDFG auto-optimize__

Usage: `python debug_auto_optimize_sdfg.py sdfg1.json`

```python
import json
import sys

import dace
from dace.utils import print_sdfg_hash

from gt4py.next.program_processors.runners.dace import (
    transformations as gtx_transformations,
)

file = sys.argv[1]

with open(file) as f:
    data = json.load(f)
sdfg = dace.SDFG.from_json(data)
print_sdfg_hash(sdfg)

gtx_transformations.gt_auto_optimize(
    sdfg,
    gpu=False,
    constant_symbols={},
    unit_strides_kind=None,
)
```

**Contributor** (on lines +47 to +50): There is also `dace.SDFG.from_file()`.

__Debugging a single SDFG transformation__

Usage: `python debug_single_sdfg_transform.py sdfg1.json`

```python
import json
import sys

import dace
from dace.utils import print_sdfg_hash

from gt4py.next.program_processors.runners.dace import (
    transformations as gtx_transformations,
)

transformation = gtx_transformations.MoveDataflowIntoIfBody
file = sys.argv[1]

with open(file) as f:
    data = json.load(f)
sdfg = dace.SDFG.from_json(data)
print_sdfg_hash(sdfg)

sdfg.apply_transformations_repeated(
    transformation(
        ignore_upstream_blocks=False,
    ),
    validate=False,
    validate_all=True,
)
```

**Contributor** (on `ignore_upstream_blocks=False`): This should be `True` for it to work correctly.
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -120,7 +120,8 @@ dependencies = [
'toolz>=0.12.1',
'typing-extensions>=4.12.0',
'versioningit>=3.1.1',
'xxhash>=3.5.0'
'xxhash>=3.5.0',
'ordered-set>=4.0.0'
]
description = 'Python library for generating high-performance implementations of stencil kernels for weather and climate modeling from a domain-specific language (DSL)'
dynamic = ['version']
@@ -18,8 +18,9 @@
from dace.transformation import dataflow as dace_dataflow
from dace.transformation.auto import auto_optimize as dace_aoptimize
from dace.transformation.passes import analysis as dace_analysis
from dace.utils import print_sdfg_hash

from gt4py.next import common as gtx_common, config as gtx_config
from gt4py.next import common as gtx_common, config as gtx_config, utils as gtx_utils
from gt4py.next.program_processors.runners.dace import transformations as gtx_transformations


@@ -231,6 +232,7 @@ def gt_auto_optimize(
Something along the line "Fuse if operational intensity goes up, but
not if we have too much internal space (register pressure).
"""
uids = gtx_utils.IDGeneratorPool()
device = dace.DeviceType.GPU if gpu else dace.DeviceType.CPU
optimization_hooks = optimization_hooks or {}

@@ -255,12 +257,14 @@
# Initial Cleanup
# NOTE: The initial simplification stage must be synchronized with the one that
# `gt_substitute_compiletime_symbols()` performs!
print_sdfg_hash(sdfg)
gtx_transformations.gt_simplify(
sdfg=sdfg,
validate=False,
skip=gtx_transformations.constants._GT_AUTO_OPT_INITIAL_STEP_SIMPLIFY_SKIP_LIST,
validate_all=validate_all,
)
print_sdfg_hash(sdfg)

if constant_symbols:
gtx_transformations.gt_substitute_compiletime_symbols(
@@ -272,6 +276,7 @@
validate=False,
validate_all=validate_all,
)
print_sdfg_hash(sdfg)

# Demote the fields.
# Actually they should probably be at the very start of this function, however,
@@ -305,10 +310,12 @@
skip=gtx_transformations.constants._GT_AUTO_OPT_INITIAL_STEP_SIMPLIFY_SKIP_LIST,
validate_all=validate_all,
)
print_sdfg_hash(sdfg)

gtx_transformations.gt_reduce_distributed_buffering(
sdfg, validate=False, validate_all=validate_all
)
print_sdfg_hash(sdfg)

# Process top level Maps
sdfg = _gt_auto_process_top_level_maps(
@@ -318,12 +325,14 @@
optimization_hooks=optimization_hooks,
validate_all=validate_all,
)
print_sdfg_hash(sdfg)

# We now ensure that point wise computations are properly double buffered,
# this ensures that rule 3 of ADR-18 is maintained.
# TODO(phimuell): Figuring out if it is important to do it before the inner
# Map optimization. I think it is, especially when we apply `LoopBlocking`.
gtx_transformations.gt_create_local_double_buffering(sdfg)
print_sdfg_hash(sdfg)

# Optimize the interior of the Maps:
sdfg = _gt_auto_process_dataflow_inside_maps(
@@ -335,7 +344,9 @@
scan_loop_unrolling_factor=scan_loop_unrolling_factor,
fuse_tasklets=fuse_tasklets,
validate_all=validate_all,
uids=uids,
)
print_sdfg_hash(sdfg)

# Configure the Maps:
# Will also perform the GPU transformation.
@@ -386,6 +397,7 @@
gpu_block_size_spec=gpu_block_size_spec if gpu_block_size_spec else None,
validate_all=validate_all,
)
print_sdfg_hash(sdfg)

# Transients
sdfg = _gt_auto_post_processing(
@@ -400,6 +412,7 @@
gpu_memory_pool=gpu_memory_pool,
validate_all=validate_all,
)
print_sdfg_hash(sdfg)

# Canonicalize the SDFG. This ensures that the code generator will see SDFGs
# that conform to the historical expected version.
@@ -683,6 +696,7 @@ def _gt_auto_process_dataflow_inside_maps(
scan_loop_unrolling_factor: int,
fuse_tasklets: bool,
validate_all: bool,
uids: gtx_utils.IDGeneratorPool,
) -> dace.SDFG:
"""Optimizes the dataflow inside the top level Maps of the SDFG inplace.

@@ -754,8 +768,9 @@

# Make sure that this runs before MoveDataflowIntoIfBody because atm it doesn't handle
# NestedSDFGs inside the ConditionalBlocks it fuses.
sdfg.save("/home/tille/Development/icon4py/graupel_before_fuse_horizontal_condblocks.json")
sdfg.apply_transformations_repeated(
gtx_transformations.FuseHorizontalConditionBlocks(),
gtx_transformations.FuseHorizontalConditionBlocks(uids=uids),
validate=True,
validate_all=True,
)
@@ -14,6 +14,7 @@
from dace.sdfg import graph as dace_graph, nodes as dace_nodes
from dace.transformation import helpers as dace_helpers

from gt4py.next import utils as gtx_utils
from gt4py.next.program_processors.runners.dace import transformations as gtx_transformations


@@ -50,6 +51,10 @@ class FuseHorizontalConditionBlocks(dace_transformation.SingleStateTransformation
conditional_access_node = dace_transformation.PatternNode(dace_nodes.AccessNode)
nsdfg_a = dace_transformation.PatternNode(dace_nodes.NestedSDFG)
nsdfg_b = dace_transformation.PatternNode(dace_nodes.NestedSDFG)
uids = dace_properties.Property(dtype=gtx_utils.IDGeneratorPool)

def __init__(self, *args, uids: gtx_utils.IDGeneratorPool, **kwargs):
super().__init__(*args, **kwargs)
self.uids = uids

# The fusion of the two conditional blocks can happen in any order. To avoid any nondeterminism, distinguish which one is the fused and which one is the extended conditional block, which will include the fused one.
@staticmethod
@@ -293,7 +298,7 @@ def apply(
for data_name, data_desc in fused_conditional_block.sdfg.arrays.items():
if data_name == "__cond":
continue
new_data_name = gtx_transformations.utils.unique_name(data_name) + "_from_cb_fusion"
new_data_name = next(self.uids[f"{data_name}_cb_fusion"])
data_desc_renamed = copy.deepcopy(data_desc)
second_arrays_rename_map[data_name] = (
nested_sdfg_of_extended_conditional_block.sdfg.add_datadesc(
Expand Down
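The `next(self.uids[...])` call above relies on the `IDGeneratorPool` this PR introduces in `gt4py.next.utils`. A minimal sketch of the idea (a hypothetical simplified version, not the actual implementation): one counter per key, so repeated requests yield unique names in a deterministic order:

```python
import itertools


class IDGeneratorPool:
    """Sketch: per-key generators producing deterministic unique names."""

    def __init__(self):
        self._gens = {}

    def __getitem__(self, key: str):
        # Lazily create one generator per key; subsequent lookups
        # return the same generator, so the counter keeps advancing.
        if key not in self._gens:
            self._gens[key] = (f"{key}_{i}" for i in itertools.count())
        return self._gens[key]


pool = IDGeneratorPool()
print(next(pool["a_cb_fusion"]))  # → a_cb_fusion_0
print(next(pool["a_cb_fusion"]))  # → a_cb_fusion_1
print(next(pool["b_cb_fusion"]))  # → b_cb_fusion_0
```

Unlike a name derived from, say, `id()` or an unordered set traversal, the generated names depend only on the order of requests, which is what makes the renaming in `apply` reproducible across runs.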
@@ -15,6 +15,7 @@
import dace
from dace import subsets as dace_subsets
from dace.sdfg import nodes as dace_nodes
from ordered_set import OrderedSet

from gt4py.next.program_processors.runners.dace.transformations import (
splitting_tools as gtx_dace_split,
@@ -78,8 +79,11 @@ def _new_name(old_name: str) -> str:
elif isinstance(node, dace_nodes.NestedSDFG):
node_ = graph.add_nested_sdfg(
sdfg=copy.deepcopy(node.sdfg),
inputs=set(node.in_connectors.keys()),
outputs=set(node.out_connectors.keys()),
# TODO(tehrengruber): What is the performance optimization from Philip about?
# In any case this here leads to an sdfg in which the order in graph.nodes
# is indeterministic, but to_json, then from_json restores it again.
inputs={k: None for k in node.in_connectors.keys()},
outputs={k: None for k in node.out_connectors.keys()},
symbol_mapping=node.symbol_mapping.copy(),
debuginfo=copy.copy(node.debuginfo),
)

**Contributor:** What do you mean with my performance optimization? What could be a problem is that copying `node.sdfg` might also copy the surrounding SDFG, because nested SDFGs have a reference to their parent SDFG.

**Contributor** (on lines +85 to +86): This should be equivalent since the data types should not change.

Suggested change:
inputs={k: None for k in node.in_connectors.keys()},
outputs={k: None for k in node.out_connectors.keys()},
→
inputs=node.in_connectors.copy(),
outputs=node.out_connectors.copy(),

If you want to play it safe, ignore this suggestion.
@@ -202,8 +206,10 @@ def split_overlapping_map_range(
Two lists, each containing the ranges corresponding to the splitted range
for the first and the second map, respectively.
"""
first_map_params = set(first_map.params)
second_map_params = set(second_map.params)
# TODO(tehrengruber): The structure here looks a little funky. We just use an ordered set for
# now, but likely no sets are needed at all.
first_map_params = OrderedSet(first_map.params)
second_map_params = OrderedSet(second_map.params)
if first_map_params != second_map_params:
return None
**Contributor** (on lines -205 to 214): Sets are needed here, at least for the check that follows, because a Map with parameters `["i", "j"]` is the same as one with parameters `["j", "i"]`. However, you can do something like:

first_map_params = sorted(first_map.params)
second_map_params = sorted(second_map.params)

then you can also remove the sorted() calls below as well.
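The reviewer's point can be demonstrated with plain Python (no DaCe needed): set equality already ignores order, and comparing sorted lists gives the same order-insensitive check while keeping iteration order deterministic:

```python
first = ["i", "j"]
second = ["j", "i"]

# Both formulations agree that the parameter collections are equal.
assert set(first) == set(second)
assert sorted(first) == sorted(second)

# But iterating a sorted list is deterministic across runs and processes,
# whereas the iteration order of a set is an implementation detail.
params = sorted(first)
print(params)  # → ['i', 'j']
```

This is why `sorted(...)` (or an `OrderedSet`) removes a source of nondeterminism here without changing the semantics of the equality check.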

