
Commit 5e3fa28

ci: Add flaky marker to LTS tests (#2290)

Signed-off-by: oliver könig <[email protected]>

1 parent: dcd3b39

File tree: 10 files changed, +22 −0 lines
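
Every addition in this commit is the same one-line change: a bare @pytest.mark.flaky marker stacked on top of the existing @pytest.mark.flaky_in_dev marker. As a minimal sketch of how such custom markers are typically registered so pytest does not warn about unknown marks, a conftest.py hook like the one below could be used; the registration strings and file placement are assumptions for illustration, not part of this commit:

# conftest.py -- a sketch, assuming `flaky` and `flaky_in_dev` are plain
# custom markers rather than the parameterized marker provided by the
# pytest-rerunfailures plugin.
def pytest_configure(config):
    # Registering markers suppresses PytestUnknownMarkWarning and makes
    # them show up under `pytest --markers`.
    config.addinivalue_line("markers", "flaky: test known to be flaky in LTS CI")
    config.addinivalue_line("markers", "flaky_in_dev: test known to be flaky in dev CI")

A CI job can then deselect the marked tests with a marker expression such as pytest -m "not flaky".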

tests/unit_tests/dist_checkpointing/test_flattened_resharding.py (3 additions, 0 deletions)

@@ -34,6 +34,7 @@ def setup_method(self, method):
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2854
     @pytest.mark.parametrize(
         ('src_tp_pp', 'dest_tp_pp'),
@@ -60,6 +61,7 @@ def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp
 
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2854
     @pytest.mark.parametrize(
         ('src_tp_pp', 'dest_tp_pp', 'expected_ckpt_offsets_by_rank'),
@@ -126,6 +128,7 @@ def test_reformulate_nd_flattened_tensors(
 
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2854
     @pytest.mark.parametrize(('src_tp_pp',), [((2, 4),), ((8, 1),), ((1, 1),), ((1, 4),)])
     def test_load_tensor_metadata(self, tmp_path_dist_ckpt, src_tp_pp):

tests/unit_tests/dist_checkpointing/test_global_metadata_reuse.py (2 additions, 0 deletions)

@@ -23,6 +23,7 @@ def setup_method(self, method):
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2856
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
     def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
@@ -94,6 +95,7 @@ def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
 
         assert resume_ckpt_context['save_strategy'].validated_loaded_metadata_reuse
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2856
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
     def test_no_global_metadata_reuse_on_different_parallelism(self, tmp_path_dist_ckpt, tp, pp):

tests/unit_tests/dist_checkpointing/test_local.py (1 addition, 0 deletions)

@@ -255,6 +255,7 @@ def deterministic_empty(*args, **kwargs):
     @pytest.mark.parametrize(('use_ramdisk'), [True, False])
     @pytest.mark.parametrize(('async_save'), [True, False])
     @pytest.mark.parametrize(('algo'), ['atomic', 'fully_parallel'])
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_failed_save(self, caplog, tmp_path_dist_ckpt, tp, pp, use_ramdisk, async_save, algo):
         Utils.initialize_model_parallel(tp, pp)
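
As an aside on the pattern above: stacked @pytest.mark.parametrize decorators generate the cross-product of their parameter sets, and a marker like flaky is attached to every generated item. A self-contained sketch with hypothetical test names, not code from this repository (it assumes the flaky marker is registered as shown earlier):

# Stacked parametrize decorators multiply out: this demo collects
# 2 (use_ramdisk) x 2 (async_save) x 2 (algo) = 8 test items, and the
# `flaky` marker is attached to each of them.
import pytest

@pytest.mark.parametrize('use_ramdisk', [True, False])
@pytest.mark.parametrize('async_save', [True, False])
@pytest.mark.parametrize('algo', ['atomic', 'fully_parallel'])
@pytest.mark.flaky
def test_demo(use_ramdisk, async_save, algo):
    assert algo in ('atomic', 'fully_parallel')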

tests/unit_tests/dist_checkpointing/test_serialization.py (1 addition, 0 deletions)

@@ -547,6 +547,7 @@ def test_tensor_shape_mismatch(self, tmp_path_dist_ckpt):
         not is_torch_min_version("2.3.0"),
         reason="remove_sharded_tensors relies on Torch APIs introduced in v2.3.0",
     )
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_remove_sharded_tensors(self, tmp_path_dist_ckpt):
         Utils.initialize_model_parallel(2, 4)

tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py (3 additions, 0 deletions)

@@ -20,6 +20,7 @@
 
 
 # Test model for testing FSDP
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 class TestModel(torch.nn.Module):
     def __init__(self, input_dim, output_dim):
@@ -36,6 +37,7 @@ def forward(self, x):
 
 
 # Test model with uniform shaped weights for testing FSDP
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 class TestModelUniform(torch.nn.Module):
     def __init__(self, hidden_dim):
@@ -67,6 +69,7 @@ def setup_seed(seed):
     torch.backends.cudnn.benchmark = False  # Disable auto-tuner for reproducibility
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 class TestFullyShardedDataParallel:
     @classmethod

tests/unit_tests/models/test_bert_model.py (1 addition, 0 deletions)

@@ -167,6 +167,7 @@ def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker):
         ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}"
 
     @pytest.mark.internal
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker):
         bert_layer_with_transformer_engine_spec.submodules.self_attention.params[

tests/unit_tests/models/test_t5_model.py (1 addition, 0 deletions)

@@ -103,6 +103,7 @@ def test_set_input_tensor(self):
         assert self.t5_model.encoder_hidden_state.shape[1] == micro_batch_size
         assert self.t5_model.encoder_hidden_state.shape[2] == config.hidden_size
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_post_process_forward(self):
         pass

tests/unit_tests/test_parallel_state.py (3 additions, 0 deletions)

@@ -1,3 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
 import pytest
 import torch
 
@@ -10,6 +12,7 @@
 
 
 @pytest.mark.parametrize('order', test_parallel_order)
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_initialize_and_destroy_model_parallel(order):
     with pytest.raises(AssertionError):

tests/unit_tests/test_utils.py (6 additions, 0 deletions)

@@ -1,3 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
 import os
 import time
 import urllib.request as req
@@ -182,6 +184,7 @@ def nvtx_decorated_function_with_message():
     assert all(execution_tracker.values())
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_check_param_hashes_across_dp_replicas():
     world = int(os.getenv('WORLD_SIZE', '1'))
@@ -207,6 +210,7 @@ def test_check_param_hashes_across_dp_replicas():
     _deinit_distributed()
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_cross_check_param_hashes_across_dp_replicas():
     world = int(os.getenv('WORLD_SIZE', '1'))
@@ -231,6 +235,7 @@ def test_cross_check_param_hashes_across_dp_replicas():
 
 
 @pytest.mark.parametrize("use_distributed_optimizer", [False, True])
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 @pytest.mark.internal
 def test_param_norm(use_distributed_optimizer: bool):
@@ -281,6 +286,7 @@ def test_param_norm(use_distributed_optimizer: bool):
     _deinit_distributed()
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_straggler_detector():
     world = int(os.getenv('WORLD_SIZE', '1'))

tests/unit_tests/transformer/test_retro_attention.py (1 addition, 0 deletions)

@@ -196,6 +196,7 @@ def run_gpu_forward(self, recompute_granularity, use_transformer_engine):
             config.hidden_size,
         )
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_gpu_forward(self):
         for recompute_granularity in (None, 'selective'):
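
Taken together, the commit tags each known-flaky LTS test with the new marker so the pipeline can filter on it. The snippet below is an illustrative sketch of such a filtered run; the repository's actual CI entry point is not shown in this commit:

# run_lts_unit_tests.py -- hypothetical helper, not from this repository.
# `-m` accepts a boolean marker expression evaluated per collected test.
import sys

import pytest

if __name__ == '__main__':
    sys.exit(pytest.main(['tests/unit_tests', '-m', 'not flaky']))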
