
Commit 5e3fa28

ci: Add flaky marker to LTS tests (#2290)

Signed-off-by: oliver könig <[email protected]>

1 parent: dcd3b39

File tree: 10 files changed, +22 −0 lines
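
Every addition in this commit is the same one-line change: a bare @pytest.mark.flaky marker stacked on top of the existing @pytest.mark.flaky_in_dev marker. As a minimal sketch of how such custom markers are typically registered so pytest does not warn about unknown marks, a conftest.py hook like the one below could be used; the registration strings and file placement are assumptions for illustration, not part of this commit:

# conftest.py -- a sketch, assuming `flaky` and `flaky_in_dev` are plain
# custom markers rather than the parameterized marker provided by the
# pytest-rerunfailures plugin.
def pytest_configure(config):
    # Registering markers suppresses PytestUnknownMarkWarning and makes
    # them show up under `pytest --markers`.
    config.addinivalue_line("markers", "flaky: test known to be flaky in LTS CI")
    config.addinivalue_line("markers", "flaky_in_dev: test known to be flaky in dev CI")

A CI job can then deselect the marked tests with a marker expression such as pytest -m "not flaky".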

tests/unit_tests/dist_checkpointing/test_flattened_resharding.py (3 additions, 0 deletions)

@@ -34,6 +34,7 @@ def setup_method(self, method):
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2854
     @pytest.mark.parametrize(
         ('src_tp_pp', 'dest_tp_pp'),
@@ -60,6 +61,7 @@ def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp
 
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2854
     @pytest.mark.parametrize(
         ('src_tp_pp', 'dest_tp_pp', 'expected_ckpt_offsets_by_rank'),
@@ -126,6 +128,7 @@ def test_reformulate_nd_flattened_tensors(
 
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2854
     @pytest.mark.parametrize(('src_tp_pp',), [((2, 4),), ((8, 1),), ((1, 1),), ((1, 4),)])
     def test_load_tensor_metadata(self, tmp_path_dist_ckpt, src_tp_pp):

tests/unit_tests/dist_checkpointing/test_global_metadata_reuse.py (2 additions, 0 deletions)

@@ -23,6 +23,7 @@ def setup_method(self, method):
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2856
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
     def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
@@ -94,6 +95,7 @@ def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
 
         assert resume_ckpt_context['save_strategy'].validated_loaded_metadata_reuse
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev  # Issue #2856
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
     def test_no_global_metadata_reuse_on_different_parallelism(self, tmp_path_dist_ckpt, tp, pp):

tests/unit_tests/dist_checkpointing/test_local.py (1 addition, 0 deletions)

@@ -255,6 +255,7 @@ def deterministic_empty(*args, **kwargs):
     @pytest.mark.parametrize(('use_ramdisk'), [True, False])
     @pytest.mark.parametrize(('async_save'), [True, False])
     @pytest.mark.parametrize(('algo'), ['atomic', 'fully_parallel'])
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_failed_save(self, caplog, tmp_path_dist_ckpt, tp, pp, use_ramdisk, async_save, algo):
         Utils.initialize_model_parallel(tp, pp)
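
As an aside on the pattern above: stacked @pytest.mark.parametrize decorators generate the cross-product of their parameter sets, and a marker like flaky is attached to every generated item. A self-contained sketch with hypothetical test names, not code from this repository (it assumes the flaky marker is registered as shown earlier):

# Stacked parametrize decorators multiply out: this demo collects
# 2 (use_ramdisk) x 2 (async_save) x 2 (algo) = 8 test items, and the
# `flaky` marker is attached to each of them.
import pytest

@pytest.mark.parametrize('use_ramdisk', [True, False])
@pytest.mark.parametrize('async_save', [True, False])
@pytest.mark.parametrize('algo', ['atomic', 'fully_parallel'])
@pytest.mark.flaky
def test_demo(use_ramdisk, async_save, algo):
    assert algo in ('atomic', 'fully_parallel')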

tests/unit_tests/dist_checkpointing/test_serialization.py (1 addition, 0 deletions)

@@ -547,6 +547,7 @@ def test_tensor_shape_mismatch(self, tmp_path_dist_ckpt):
         not is_torch_min_version("2.3.0"),
         reason="remove_sharded_tensors relies on Torch APIs introduced in v2.3.0",
     )
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_remove_sharded_tensors(self, tmp_path_dist_ckpt):
         Utils.initialize_model_parallel(2, 4)

tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py (3 additions, 0 deletions)

@@ -20,6 +20,7 @@
 
 
 # Test model for testing FSDP
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 class TestModel(torch.nn.Module):
     def __init__(self, input_dim, output_dim):
@@ -36,6 +37,7 @@ def forward(self, x):
 
 
 # Test model with uniform shaped weights for testing FSDP
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 class TestModelUniform(torch.nn.Module):
     def __init__(self, hidden_dim):
@@ -67,6 +69,7 @@ def setup_seed(seed):
     torch.backends.cudnn.benchmark = False  # Disable auto-tuner for reproducibility
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 class TestFullyShardedDataParallel:
     @classmethod

tests/unit_tests/models/test_bert_model.py (1 addition, 0 deletions)

@@ -167,6 +167,7 @@ def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker):
         ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}"
 
     @pytest.mark.internal
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker):
         bert_layer_with_transformer_engine_spec.submodules.self_attention.params[

tests/unit_tests/models/test_t5_model.py (1 addition, 0 deletions)

@@ -103,6 +103,7 @@ def test_set_input_tensor(self):
         assert self.t5_model.encoder_hidden_state.shape[1] == micro_batch_size
         assert self.t5_model.encoder_hidden_state.shape[2] == config.hidden_size
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_post_process_forward(self):
         pass

tests/unit_tests/test_parallel_state.py (3 additions, 0 deletions)

@@ -1,3 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
 import pytest
 import torch
 
@@ -10,6 +12,7 @@
 
 
 @pytest.mark.parametrize('order', test_parallel_order)
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_initialize_and_destroy_model_parallel(order):
     with pytest.raises(AssertionError):

tests/unit_tests/test_utils.py (6 additions, 0 deletions)

@@ -1,3 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
 import os
 import time
 import urllib.request as req
@@ -182,6 +184,7 @@ def nvtx_decorated_function_with_message():
     assert all(execution_tracker.values())
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_check_param_hashes_across_dp_replicas():
     world = int(os.getenv('WORLD_SIZE', '1'))
@@ -207,6 +210,7 @@ def test_check_param_hashes_across_dp_replicas():
     _deinit_distributed()
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_cross_check_param_hashes_across_dp_replicas():
     world = int(os.getenv('WORLD_SIZE', '1'))
@@ -231,6 +235,7 @@ def test_cross_check_param_hashes_across_dp_replicas():
 
 
 @pytest.mark.parametrize("use_distributed_optimizer", [False, True])
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 @pytest.mark.internal
 def test_param_norm(use_distributed_optimizer: bool):
@@ -281,6 +286,7 @@ def test_param_norm(use_distributed_optimizer: bool):
     _deinit_distributed()
 
 
+@pytest.mark.flaky
 @pytest.mark.flaky_in_dev
 def test_straggler_detector():
     world = int(os.getenv('WORLD_SIZE', '1'))

tests/unit_tests/transformer/test_retro_attention.py (1 addition, 0 deletions)

@@ -196,6 +196,7 @@ def run_gpu_forward(self, recompute_granularity, use_transformer_engine):
             config.hidden_size,
         )
 
+    @pytest.mark.flaky
     @pytest.mark.flaky_in_dev
     def test_gpu_forward(self):
         for recompute_granularity in (None, 'selective'):
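
Taken together, the commit tags each known-flaky LTS test with the new marker so the pipeline can filter on it. The snippet below is an illustrative sketch of such a filtered run; the repository's actual CI entry point is not shown in this commit:

# run_lts_unit_tests.py -- hypothetical helper, not from this repository.
# `-m` accepts a boolean marker expression evaluated per collected test.
import sys

import pytest

if __name__ == '__main__':
    sys.exit(pytest.main(['tests/unit_tests', '-m', 'not flaky']))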
