Skip to content

Commit e9a0eae

Browse files
committed
ci: Add flaky marker to LTS tests
Signed-off-by: oliver könig <[email protected]>
1 parent e8b9df1 commit e9a0eae

14 files changed

+34
-11
lines changed

tests/unit_tests/data/test_bin_reader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def read(self, path, byte_range):
142142
setattr(msc, "resolve_storage_client", _msc_resolve_storage_client)
143143

144144

145+
@pytest.mark.flaky
145146
@pytest.mark.flaky
146147
@pytest.mark.flaky_in_dev
147148
def test_bin_reader():

tests/unit_tests/data/test_preprocess_data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ def bert_vocab(odir):
213213
return path
214214

215215

216+
@pytest.mark.flaky
216217
@pytest.mark.flaky
217218
@pytest.mark.flaky_in_dev
218219
def test_preprocess_data_bert():

tests/unit_tests/dist_checkpointing/test_flattened_resharding.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ def setup_method(self, method):
3434
def teardown_method(self, method):
3535
Utils.destroy_model_parallel()
3636

37-
@pytest.mark.flaky_in_dev # Issue #2854
37+
@pytest.mark.flaky
38+
@pytest.mark.flaky_in_dev # Issue #2854
3839
@pytest.mark.parametrize(
3940
('src_tp_pp', 'dest_tp_pp'),
4041
[((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
@@ -60,7 +61,8 @@ def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp
6061

6162
Utils.destroy_model_parallel()
6263

63-
@pytest.mark.flaky_in_dev # Issue #2854
64+
@pytest.mark.flaky
65+
@pytest.mark.flaky_in_dev # Issue #2854
6466
@pytest.mark.parametrize(
6567
('src_tp_pp', 'dest_tp_pp', 'expected_ckpt_offsets_by_rank'),
6668
[
@@ -126,7 +128,8 @@ def test_reformulate_nd_flattened_tensors(
126128

127129
Utils.destroy_model_parallel()
128130

129-
@pytest.mark.flaky_in_dev # Issue #2854
131+
@pytest.mark.flaky
132+
@pytest.mark.flaky_in_dev # Issue #2854
130133
@pytest.mark.parametrize(('src_tp_pp',), [((2, 4),), ((8, 1),), ((1, 1),), ((1, 4),)])
131134
def test_load_tensor_metadata(self, tmp_path_dist_ckpt, src_tp_pp):
132135
Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp')

tests/unit_tests/dist_checkpointing/test_fully_parallel.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,8 @@ def test_load_distribution(self, parallelize_within_dp, tmp_path_dist_ckpt):
331331

332332
@pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda'])
333333
@pytest.mark.flaky
334-
@pytest.mark.flaky_in_dev
334+
@pytest.mark.flaky
335+
@pytest.mark.flaky_in_dev
335336
def test_memory_usage(self, state_dict_device, tmp_path_dist_ckpt):
336337
Utils.initialize_model_parallel(2, 1)
337338

tests/unit_tests/dist_checkpointing/test_global_metadata_reuse.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ def setup_method(self, method):
2323
def teardown_method(self, method):
2424
Utils.destroy_model_parallel()
2525

26-
@pytest.mark.flaky_in_dev # Issue #2856
26+
@pytest.mark.flaky
27+
@pytest.mark.flaky_in_dev # Issue #2856
2728
@pytest.mark.parametrize(('tp,pp'), [(2, 4)])
2829
def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
2930
Utils.initialize_model_parallel(tp, pp)
@@ -94,7 +95,8 @@ def test_global_metadata_reuse(self, tmp_path_dist_ckpt, tp, pp):
9495

9596
assert resume_ckpt_context['save_strategy'].validated_loaded_metadata_reuse
9697

97-
@pytest.mark.flaky_in_dev # Issue #2856
98+
@pytest.mark.flaky
99+
@pytest.mark.flaky_in_dev # Issue #2856
98100
@pytest.mark.parametrize(('tp,pp'), [(2, 4)])
99101
def test_no_global_metadata_reuse_on_different_parallelism(self, tmp_path_dist_ckpt, tp, pp):
100102
Utils.initialize_model_parallel(tp, pp)

tests/unit_tests/dist_checkpointing/test_local.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ def deterministic_empty(*args, **kwargs):
255255
@pytest.mark.parametrize(('use_ramdisk'), [True, False])
256256
@pytest.mark.parametrize(('async_save'), [True, False])
257257
@pytest.mark.parametrize(('algo'), ['atomic', 'fully_parallel'])
258-
@pytest.mark.flaky_in_dev
258+
@pytest.mark.flaky
259+
@pytest.mark.flaky_in_dev
259260
def test_failed_save(self, caplog, tmp_path_dist_ckpt, tp, pp, use_ramdisk, async_save, algo):
260261
Utils.initialize_model_parallel(tp, pp)
261262
num_floating_point_operations_so_far = 0

tests/unit_tests/dist_checkpointing/test_serialization.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,8 @@ def test_tensor_shape_mismatch(self, tmp_path_dist_ckpt):
547547
not is_torch_min_version("2.3.0"),
548548
reason="remove_sharded_tensors relies on Torch APIs introduced in v2.3.0",
549549
)
550-
@pytest.mark.flaky_in_dev
550+
@pytest.mark.flaky
551+
@pytest.mark.flaky_in_dev
551552
def test_remove_sharded_tensors(self, tmp_path_dist_ckpt):
552553
Utils.initialize_model_parallel(2, 4)
553554

tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def get_moe_model_and_buffers(
109109
@pytest.mark.parametrize("etp_size", [1, 2])
110110
@pytest.mark.parametrize("num_distributed_optimizer_instances", [1, 2])
111111
@pytest.mark.flaky
112+
@pytest.mark.flaky
112113
@pytest.mark.flaky_in_dev
113114
def test_grad_sync(
114115
use_distributed_optimizer: bool,

tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121

2222
# Test model for testing FSDP
23+
@pytest.mark.flaky
2324
@pytest.mark.flaky_in_dev
2425
class TestModel(torch.nn.Module):
2526
def __init__(self, input_dim, output_dim):
@@ -36,6 +37,7 @@ def forward(self, x):
3637

3738

3839
# Test model with uniform shaped weights for testing FSDP
40+
@pytest.mark.flaky
3941
@pytest.mark.flaky_in_dev
4042
class TestModelUniform(torch.nn.Module):
4143
def __init__(self, hidden_dim):
@@ -67,6 +69,7 @@ def setup_seed(seed):
6769
torch.backends.cudnn.benchmark = False # Disable auto-tuner for reproducibility
6870

6971

72+
@pytest.mark.flaky
7073
@pytest.mark.flaky_in_dev
7174
class TestFullyShardedDataParallel:
7275
@classmethod

tests/unit_tests/models/test_bert_model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,8 @@ def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker):
167167
), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}"
168168

169169
@pytest.mark.internal
170-
@pytest.mark.flaky_in_dev
170+
@pytest.mark.flaky
171+
@pytest.mark.flaky_in_dev
171172
def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker):
172173
bert_layer_with_transformer_engine_spec.submodules.self_attention.params[
173174
'attn_mask_type'

0 commit comments

Comments (0)