
Commit 58c7897

use a shared strategy in conftest.py
1 parent b634f78 commit 58c7897

9 files changed (+119 lines, -24 lines)

conftest.py

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+import pytest
+import os
+from keras_rs.src.utils import tpu_test_utils
+
+@pytest.fixture(scope="session", autouse=True)
+def prime_shared_tpu_strategy(request):
+    """
+    Eagerly initializes the shared TPU strategy at the beginning of the session
+    if running on a TPU. This helps catch initialization errors early.
+    """
+    strategy = tpu_test_utils.get_shared_tpu_strategy()
+    if not strategy:
+        pytest.fail(
+            "Failed to initialize shared TPUStrategy for the test session. "
+            "Check logs for details from create_tpu_strategy."
+        )
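
Note: because the fixture is session-scoped and autouse, pytest runs it once before any test executes, so a failed TPU bring-up aborts the run immediately instead of surfacing in whichever test first touches the strategy. Tests then reach the already-cached singleton through the TestCase.strategy property. A minimal sketch of a consuming test; the class name, assertion, and the `from keras_rs.src import testing` import are illustrative assumptions, not part of this commit:

    from keras_rs.src import testing


    class SharedStrategySmokeTest(testing.TestCase):
        def test_strategy_is_primed(self):
            # Off TPU this resolves to a DummyStrategy; on TPU, to the
            # session-wide TPUStrategy created once via conftest.py.
            self.assertGreaterEqual(self.strategy.num_replicas_in_sync, 1)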

keras_rs/src/layers/embedding/distributed_embedding_test.py

Lines changed: 16 additions & 16 deletions

@@ -53,7 +53,7 @@ def setUp(self):
         # FLAGS.xla_sparse_core_max_unique_ids_per_partition_per_sample = 16

         self.batch_size = (
-            BATCH_SIZE_PER_CORE * self._strategy.num_replicas_in_sync
+            BATCH_SIZE_PER_CORE * self.strategy.num_replicas_in_sync
         )

     def get_embedding_config(self, input_type, placement):
@@ -194,11 +194,11 @@ def test_basics(self, input_type, placement):

         if placement == "sparsecore" and not self.on_tpu:
             with self.assertRaisesRegex(Exception, "sparsecore"):
-                with self._strategy.scope():
+                with self.strategy.scope():
                     distributed_embedding.DistributedEmbedding(feature_configs)
             return

-        with self._strategy.scope():
+        with self.strategy.scope():
             layer = distributed_embedding.DistributedEmbedding(feature_configs)

         if keras.backend.backend() == "jax":
@@ -276,7 +276,7 @@ def test_model_fit(self, input_type, use_weights):
             (test_model_inputs, test_labels)
         )

-        with self._strategy.scope():
+        with self.strategy.scope():
             layer = distributed_embedding.DistributedEmbedding(feature_configs)

         def _create_keras_input(
@@ -347,7 +347,7 @@ def test_dataset_generator():
             # New preprocessed data removes the `weights` component.
             dataset_has_weights = False
         else:
-            train_dataset = self._strategy.experimental_distribute_dataset(
+            train_dataset = self.strategy.experimental_distribute_dataset(
                 train_dataset,
                 options=tf.distribute.InputOptions(
                     experimental_fetch_to_device=False
@@ -362,7 +362,7 @@ def test_dataset_generator():
             inputs=keras_model_inputs, outputs=keras_model_outputs
         )

-        with self._strategy.scope():
+        with self.strategy.scope():
             model.compile(optimizer="adam", loss="mse")

         model_inputs, _ = next(iter(test_dataset))
@@ -511,7 +511,7 @@ def test_correctness(
         if not use_weights:
             weights = None

-        with self._strategy.scope():
+        with self.strategy.scope():
             layer = distributed_embedding.DistributedEmbedding(feature_config)

         if keras.backend.backend() == "jax":
@@ -568,7 +568,7 @@ def test_correctness(

         self.assertEqual(res.shape, (self.batch_size, EMBEDDING_OUTPUT_DIM))

-        with self._strategy.scope():
+        with self.strategy.scope():
             tables = layer.get_embedding_tables()

         emb = tables["table"]
@@ -633,11 +633,11 @@ def test_shared_table(self):
             "dense", embedding_config
         )

-        with self._strategy.scope():
+        with self.strategy.scope():
             layer = distributed_embedding.DistributedEmbedding(embedding_config)

         res = tpu_test_utils.run_with_strategy(
-            self._strategy, layer.__call__, inputs
+            self.strategy, layer.__call__, inputs
         )

         if self.placement == "default_device":
@@ -709,11 +709,11 @@ def test_mixed_placement(self):
             "dense", embedding_config
         )

-        with self._strategy.scope():
+        with self.strategy.scope():
             layer = distributed_embedding.DistributedEmbedding(embedding_config)

         res = tpu_test_utils.run_with_strategy(
-            self._strategy, layer.__call__, inputs
+            self.strategy, layer.__call__, inputs
         )

         self.assertEqual(
@@ -740,22 +740,22 @@ def test_save_load_model(self):
         with tempfile.TemporaryDirectory() as temp_dir:
             path = os.path.join(temp_dir, "model.keras")

-            with self._strategy.scope():
+            with self.strategy.scope():
                 layer = distributed_embedding.DistributedEmbedding(
                     feature_configs
                 )
                 keras_outputs = layer(keras_inputs)
                 model = keras.Model(inputs=keras_inputs, outputs=keras_outputs)

             output_before = tpu_test_utils.run_with_strategy(
-                self._strategy, model.__call__, inputs
+                self.strategy, model.__call__, inputs
             )
             model.save(path)

-            with self._strategy.scope():
+            with self.strategy.scope():
                 reloaded_model = keras.models.load_model(path)
             output_after = tpu_test_utils.run_with_strategy(
-                self._strategy, reloaded_model.__call__, inputs
+                self.strategy, reloaded_model.__call__, inputs
             )

             if self.placement == "sparsecore":
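
Note: every change in this file is the same mechanical rename from the private self._strategy attribute to the public self.strategy property, which now resolves to the shared session-wide strategy. The tf.distribute calls themselves are unchanged; for reference, this is the standard pattern the tests exercise, shown here as a sketch with the default (no-op) strategy so it runs on CPU:

    import tensorflow as tf

    strategy = tf.distribute.get_strategy()  # default strategy off-TPU

    # Variables created under scope() are placed according to the strategy.
    with strategy.scope():
        model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
        model.compile(optimizer="adam", loss="mse")

    # Distributing a dataset shards each batch across replicas.
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.random.normal((8, 4)), tf.random.normal((8, 1)))
    ).batch(4)
    dist_dataset = strategy.experimental_distribute_dataset(dataset)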

keras_rs/src/losses/list_mle_loss_test.py

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@

 class ListMLELossTest(testing.TestCase, parameterized.TestCase):
     def setUp(self):
+        super().setUp()
         self.unbatched_scores = ops.array(
             [1.0, 3.0, 2.0, 4.0, 0.8], dtype="float32"
         )

keras_rs/src/losses/pairwise_hinge_loss_test.py

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@

 class PairwiseHingeLossTest(testing.TestCase, parameterized.TestCase):
     def setUp(self):
+        super().setUp()
         self.unbatched_scores = ops.array([1.0, 3.0, 2.0, 4.0, 0.8])
         self.unbatched_labels = ops.array([1.0, 0.0, 1.0, 3.0, 2.0])

keras_rs/src/losses/pairwise_logistic_loss_test.py

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@

 class PairwiseLogisticLossTest(testing.TestCase, parameterized.TestCase):
     def setUp(self):
+        super().setUp()
         self.unbatched_scores = ops.array([1.0, 3.0, 2.0, 4.0, 0.8])
         self.unbatched_labels = ops.array([1.0, 0.0, 1.0, 3.0, 2.0])

keras_rs/src/losses/pairwise_mean_squared_error_test.py

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@

 class PairwiseMeanSquaredErrorTest(testing.TestCase, parameterized.TestCase):
     def setUp(self):
+        super().setUp()
         self.unbatched_scores = ops.array([1.0, 3.0, 2.0, 4.0, 0.8])
         self.unbatched_labels = ops.array([1.0, 0.0, 1.0, 3.0, 2.0])

keras_rs/src/losses/pairwise_soft_zero_one_loss_test.py

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@

 class PairwiseSoftZeroOneLossTest(testing.TestCase, parameterized.TestCase):
     def setUp(self):
+        super().setUp()
         self.unbatched_scores = ops.array([1.0, 3.0, 2.0, 4.0, 0.8])
         self.unbatched_labels = ops.array([1.0, 0.0, 1.0, 3.0, 2.0])
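
Note: all five loss-test fixes are the same one-liner. A setUp override that does not chain to super() silently skips the base TestCase.setUp, so attributes like on_tpu (and now _strategy) were never initialized on these classes. A minimal illustration of the failure mode; the class names here are illustrative:

    import unittest


    class Base(unittest.TestCase):
        def setUp(self):
            self.on_tpu = False  # state every test is expected to have


    class Broken(Base):
        def setUp(self):  # forgets super().setUp()
            self.scores = [1.0, 3.0, 2.0]
            # self.on_tpu was never set -> AttributeError when read


    class Fixed(Base):
        def setUp(self):
            super().setUp()  # chain first, then add per-test fixtures
            self.scores = [1.0, 3.0, 2.0]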

keras_rs/src/testing/test_case.py

Lines changed: 24 additions & 8 deletions

@@ -1,7 +1,7 @@
 import os
 import tempfile
 import unittest
-from typing import Any
+from typing import Any, Optional, Union

 import keras
 import numpy as np
@@ -10,6 +10,12 @@
 from keras_rs.src import types
 from keras_rs.src.utils import tpu_test_utils

+StrategyType = Union[
+    tf.distribute.Strategy,
+    tpu_test_utils.DummyStrategy,
+    tpu_test_utils.JaxDummyStrategy,
+]
+

 class TestCase(unittest.TestCase):
     """TestCase class for all Keras Recommenders tests."""
@@ -21,22 +27,32 @@ def setUp(self) -> None:
         if keras.backend.backend() == "tensorflow":
             tf.debugging.disable_traceback_filtering()
         self.on_tpu = "TPU_NAME" in os.environ
+        self._strategy: Optional[StrategyType] = None

     @property
-    def strategy(self):
-        if hasattr(self, "_strategy"):
-            return self._strategy
-        self._strategy = tpu_test_utils.get_tpu_strategy(self)
-        return self._strategy
+    def strategy(self) -> StrategyType:
+        strat = tpu_test_utils.get_shared_tpu_strategy()
+        if strat is None:
+            # This case should ideally be caught by the conftest.py fixture.
+            self.fail(
+                "TPU environment detected, but the shared TPUStrategy is "
+                "None. Initialization likely failed."
+            )
+        return strat
+        # if self._strategy is not None:
+        #     return self._strategy
+        # self._strategy = tpu_test_utils.get_tpu_strategy(self)
+        # return self._strategy

     def assertAllClose(
         self,
         actual: types.Tensor,
         desired: types.Tensor,
         atol: float = 1e-6,
         rtol: float = 1e-6,
-        tpu_atol=None,
-        tpu_rtol=None,
+        tpu_atol: Optional[float] = None,
+        tpu_rtol: Optional[float] = None,
         msg: str = "",
     ) -> None:
         """Verify that two tensors are close in value element by element.

keras_rs/src/utils/tpu_test_utils.py

Lines changed: 58 additions & 0 deletions

@@ -1,5 +1,6 @@
 import contextlib
 import os
+import threading
 from types import ModuleType
 from typing import Any, Callable, ContextManager, Optional, Tuple, Union

@@ -42,6 +43,63 @@ def num_replicas_in_sync(self) -> Any:

 StrategyType = Union[tf.distribute.Strategy, DummyStrategy, JaxDummyStrategy]

+_shared_strategy: Optional[StrategyType] = None
+_lock = threading.Lock()
+
+
+def create_tpu_strategy() -> Optional[StrategyType]:
+    """Initializes the TPU system and returns a TPUStrategy."""
+    print("Attempting to create TPUStrategy...")
+    try:
+        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="")
+        tf.config.experimental_connect_to_cluster(resolver)
+        tf.tpu.experimental.initialize_tpu_system(resolver)
+        strategy = tf.distribute.TPUStrategy(resolver)
+        print(
+            "TPUStrategy created successfully. "
+            f"Replicas: {strategy.num_replicas_in_sync}"
+        )
+        return strategy
+    except Exception as e:
+        print(f"Error creating TPUStrategy: {e}")
+        return None
+
+
+def get_shared_tpu_strategy() -> Optional[StrategyType]:
+    """Returns a session-wide shared TPUStrategy instance.
+
+    Creates the instance on the first call. Returns None if not in a TPU
+    environment or if creation fails.
+    """
+    global _shared_strategy
+    if _shared_strategy is not None:
+        return _shared_strategy
+
+    with _lock:
+        if _shared_strategy is None:
+            if "TPU_NAME" not in os.environ:
+                _shared_strategy = DummyStrategy()
+                return _shared_strategy
+            if keras.backend.backend() == "tensorflow":
+                resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
+                tf.config.experimental_connect_to_cluster(resolver)
+                topology = tf.tpu.experimental.initialize_tpu_system(resolver)
+                tpu_metadata = resolver.get_tpu_system_metadata()
+                device_assignment = tf.tpu.experimental.DeviceAssignment.build(
+                    topology, num_replicas=tpu_metadata.num_hosts
+                )
+                _shared_strategy = tf.distribute.TPUStrategy(
+                    resolver, experimental_device_assignment=device_assignment
+                )
+                print("### num_replicas", _shared_strategy.num_replicas_in_sync)
+            elif keras.backend.backend() == "jax":
+                if jax is None:
+                    raise ImportError(
+                        "JAX backend requires jax to be installed for TPU."
+                    )
+                print("### num_replicas", jax.device_count("tpu"))
+                _shared_strategy = JaxDummyStrategy()
+            else:
+                _shared_strategy = DummyStrategy()
+    if _shared_strategy is None:
+        print("Failed to create the shared TPUStrategy.")
+    return _shared_strategy
+

 def get_tpu_strategy(test_case: Any) -> StrategyType:
     """Get TPU strategy if on TPU, otherwise return DummyStrategy."""
