Skip to content

Commit e9a0eae

Browse files
committed
ci: Add flaky marker to LTS tests
Signed-off-by: oliver könig <[email protected]>
1 parent e8b9df1 commit e9a0eae

14 files changed

+34
-11
lines changed

tests/unit_tests/data/test_bin_reader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def read(self, path, byte_range):
142142
setattr(msc, "resolve_storage_client", _msc_resolve_storage_client)
143143

144144

145+
@pytest.mark.flaky
145146
@pytest.mark.flaky
146147
@pytest.mark.flaky_in_dev
147148
def test_bin_reader():

tests/unit_tests/data/test_preprocess_data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ def bert_vocab(odir):
213213
return path
214214

215215

216+
@pytest.mark.flaky
216217
@pytest.mark.flaky
217218
@pytest.mark.flaky_in_dev
218219
def test_preprocess_data_bert():

tests/unit_tests/dist_checkpointing/test_flattened_resharding.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ def setup_method(self, method):
3434
def teardown_method(self, method):
3535
Utils.destroy_model_parallel()
3636

37-
@pytest.mark.flaky_in_dev # Issue #2854
37+
@pytest.mark.flaky
38+
@pytest.mark.flaky_in_dev # Issue #2854
3839
@pytest.mark.parametrize(
3940
('src_tp_pp', 'dest_tp_pp'),
4041
[((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
@@ -60,7 +61,8 @@ def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp
6061

6162
Utils.destroy_model_parallel()
6263

63-
@pytest.mark.flaky_in_dev # Issue #2854
64+
@pytest.mark.flaky
65+
@pytest.mark.flaky_in_dev # Issue #2854
6466
@pytest.mark.parametrize(
6567
('src_tp_pp', 'dest_tp_pp', 'expected_ckpt_offsets_by_rank'),
6668
[
@@ -126,7 +128,8 @@ def test_reformulate_nd_flattened_tensors(
126128

127129
Utils.destroy_model_parallel()
128130

129-
@pytest.mark.flaky_in_dev # Issue #2854
131+
@pytest.mark.flaky
132+
@pytest.mark.flaky_in_dev # Issue #2854
130133
@pytest.mark.parametrize(('src_tp_pp',), [((2, 4),), ((8, 1),), ((1, 1),), ((1, 4),)])
131134
def test_load_tensor_metadata(self, tmp_path_dist_ckpt, src_tp_pp):
132135
Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp')

tests/unit_tests/dist_checkpointing/test_fully_parallel.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,8 @@ def test_load_distribution(self, parallelize_within_dp, tmp_path_dist_ckpt):
331331

332332
@pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda'])
333333
@pytest.mark.flaky
334-
@pytest.mark.flaky_in_dev
334+
@pytest.mark.flaky
335+
@pytest.mark.flaky_in_dev
335336
def test_memory_usage(self, state_dict_device, tmp_path_dist_ckpt):
336337
Utils.initialize_model_parallel(2, 1)
337338

tests/unit_tests/dist_checkpointing/test_global_metadata_reuse.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ def setup_method(self, method):
2323
def teardown_method(self, method):
2424
Utils.destroy_model_parallel()
2525

26-
@pytest.mark.flaky_in_dev # Issue #2856
26+
@pytest.mark.flaky
27+
@pytest.mark.flaky_in_dev # Issue #2856
2728
@pytest.mark.parametrize(('tp,pp'), [(2, 4)])
2829
def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
2930
Utils.initialize_model_parallel(tp, pp)
@@ -94,7 +95,8 @@ def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
9495

9596
assert resume_ckpt_context['save_strategy'].validated_loaded_metadata_reuse
9697

97-
@pytest.mark.flaky_in_dev # Issue #2856
98+
@pytest.mark.flaky
99+
@pytest.mark.flaky_in_dev # Issue #2856
98100
@pytest.mark.parametrize(('tp,pp'), [(2, 4)])
99101
def test_no_global_metadata_reuse_on_different_parallelism(self, tmp_path_dist_ckpt, tp, pp):
100102
Utils.initialize_model_parallel(tp, pp)

tests/unit_tests/dist_checkpointing/test_local.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ def deterministic_empty(*args, **kwargs):
255255
@pytest.mark.parametrize(('use_ramdisk'), [True, False])
256256
@pytest.mark.parametrize(('async_save'), [True, False])
257257
@pytest.mark.parametrize(('algo'), ['atomic', 'fully_parallel'])
258-
@pytest.mark.flaky_in_dev
258+
@pytest.mark.flaky
259+
@pytest.mark.flaky_in_dev
259260
def test_failed_save(self, caplog, tmp_path_dist_ckpt, tp, pp, use_ramdisk, async_save, algo):
260261
Utils.initialize_model_parallel(tp, pp)
261262
num_floating_point_operations_so_far = 0

tests/unit_tests/dist_checkpointing/test_serialization.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,8 @@ def test_tensor_shape_mismatch(self, tmp_path_dist_ckpt):
547547
not is_torch_min_version("2.3.0"),
548548
reason="remove_sharded_tensors relies on Torch APIs introduced in v2.3.0",
549549
)
550-
@pytest.mark.flaky_in_dev
550+
@pytest.mark.flaky
551+
@pytest.mark.flaky_in_dev
551552
def test_remove_sharded_tensors(self, tmp_path_dist_ckpt):
552553
Utils.initialize_model_parallel(2, 4)
553554

tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def get_moe_model_and_buffers(
109109
@pytest.mark.parametrize("etp_size", [1, 2])
110110
@pytest.mark.parametrize("num_distributed_optimizer_instances", [1, 2])
111111
@pytest.mark.flaky
112+
@pytest.mark.flaky
112113
@pytest.mark.flaky_in_dev
113114
def test_grad_sync(
114115
use_distributed_optimizer: bool,

tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121

2222
# Test model for testing FSDP
23+
@pytest.mark.flaky
2324
@pytest.mark.flaky_in_dev
2425
class TestModel(torch.nn.Module):
2526
def __init__(self, input_dim, output_dim):
@@ -36,6 +37,7 @@ def forward(self, x):
3637

3738

3839
# Test model with uniform shaped weights for testing FSDP
40+
@pytest.mark.flaky
3941
@pytest.mark.flaky_in_dev
4042
class TestModelUniform(torch.nn.Module):
4143
def __init__(self, hidden_dim):
@@ -67,6 +69,7 @@ def setup_seed(seed):
6769
torch.backends.cudnn.benchmark = False # Disable auto-tuner for reproducibility
6870

6971

72+
@pytest.mark.flaky
7073
@pytest.mark.flaky_in_dev
7174
class TestFullyShardedDataParallel:
7275
@classmethod

tests/unit_tests/models/test_bert_model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,8 @@ def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker):
167167
), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}"
168168

169169
@pytest.mark.internal
170-
@pytest.mark.flaky_in_dev
170+
@pytest.mark.flaky
171+
@pytest.mark.flaky_in_dev
171172
def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker):
172173
bert_layer_with_transformer_engine_spec.submodules.self_attention.params[
173174
'attn_mask_type'

0 commit comments

Comments (0)