diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore.yaml similarity index 92% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml rename to tests/functional_tests/test_cases/bert/bert_mcore.yaml index 60537ce877..94053ffb2e 100644 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/model_config.yaml +++ b/tests/functional_tests/test_cases/bert/bert_mcore.yaml @@ -3,6 +3,7 @@ ENV_VARS: NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 NCCL_ALGO: Ring CUBLAS_WORKSPACE_CONFIG: :4096:8 +TEST_TYPE: ckpt-resume MODEL_ARGS: --num-layers: 24 --hidden-size: 1024 @@ -33,12 +34,10 @@ MODEL_ARGS: --save-interval: 10000 --eval-interval: 1000 --eval-iters: 10 - --tensor-model-parallel-size: 1 - --pipeline-model-parallel-size: 2 --deterministic-mode: true --no-gradient-accumulation-fusion: true --data-cache-path: ${DATA_CACHE_PATH} --bf16: true --ckpt-format: torch --attention-backend: unfused -TEST_TYPE: regular + --dist-ckpt-strictness: log_all # backward compatibility for TE changes diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml deleted file mode 100644 index 0e90838145..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/model_config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 128 - --seq-length: 512 - --max-position-embeddings: 512 - --train-iters: 50 - --timing-log-level: 0 - --lr-decay-iters: 990000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.0001 - --min-lr: 0.00001 - --lr-warmup-fraction: 0.01 - --log-interval: 1 - --save-interval: 10000 - --eval-interval: 1000 - --eval-iters: 10 - --tensor-model-parallel-size: 1 - --pipeline-model-parallel-size: 4 - --num-layers-per-virtual-pipeline-stage: 2 - --deterministic-mode: true - --no-gradient-accumulation-fusion: true - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --ckpt-format: torch - --attention-backend: unfused - --legacy-tokenizer: true -TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml deleted file mode 100644 index f965ee1d9e..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/model_config.yaml +++ /dev/null @@ -1,44 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 128 - --seq-length: 512 - --max-position-embeddings: 512 - --train-iters: 50 - --timing-log-level: 0 - --lr-decay-iters: 990000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.0001 - --min-lr: 0.00001 - --lr-warmup-fraction: 0.01 - --log-interval: 1 - --save-interval: 10000 - --eval-interval: 1000 - --eval-iters: 10 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --deterministic-mode: true - --no-gradient-accumulation-fusion: true - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --ckpt-format: torch - --attention-backend: unfused -TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_a100.json deleted file mode 100644 index cc2045a806..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_a100.json +++ /dev/null @@ -1,142 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 10.49689, - "5": 10.48165, - "10": 10.50192, - "15": 10.45891, - "20": 10.44599, - "25": 10.35067, - "30": 10.16617, - "35": 10.04377, - "40": 9.90903, - "45": 9.75804, - "50": 9.67525, - "55": 9.55381, - "60": 9.45437, - "65": 9.42265, - "70": 9.30033, - "75": 9.3248, - "80": 9.26115, - "85": 9.29647, - "90": 9.23205, - "95": 9.23789, - "100": 9.106 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 2071.0, - "5": 2603.0, - "10": 2120.0, - "15": 2502.0, - "20": 2235.0, - "25": 2509.0, - "30": 2938.0, - "35": 2948.0, - "40": 2197.0, - "45": 3921.0, - "50": 3479.0, - "55": 3577.0, - "60": 2699.0, - "65": 3580.0, - "70": 3903.0, - "75": 4779.0, - "80": 3441.0, - "85": 4133.0, - "90": 4705.0, - "95": 4363.0, - "100": 3205.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 1754654208.0, - "5": 1754654208.0, - "10": 1754654208.0, - "15": 1754654208.0, - "20": 1754654208.0, - "25": 1754654208.0, - "30": 1754654208.0, - "35": 1754654208.0, - "40": 1754654208.0, - "45": 1754654208.0, - "50": 1754654208.0, - "55": 1754654208.0, - "60": 1754654208.0, - "65": 1754654208.0, - "70": 1754654208.0, - "75": 1754654208.0, - "80": 1754654208.0, - "85": 1754654208.0, - "90": 1754654208.0, - "95": 1754654208.0, - "100": 1754654208.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 2313432064.0, - "5": 3055894528.0, - "10": 3055894528.0, - "15": 3055894528.0, - "20": 3055894528.0, - "25": 3055894528.0, - "30": 3055894528.0, - "35": 3055894528.0, - "40": 3055894528.0, - "45": 3055894528.0, - "50": 3055894528.0, - "55": 3055894528.0, - "60": 3055894528.0, - "65": 3055894528.0, - "70": 3055894528.0, - "75": 3055894528.0, - "80": 3055894528.0, - "85": 3055894528.0, - "90": 3055894528.0, - "95": 3055894528.0, - "100": 3055894528.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 100, - "step_interval": 5, - "values": { - "1": 8.5415, - "5": 1.31601, - "10": 1.30283, - "15": 1.30113, - "20": 1.36717, - "25": 1.30303, - "30": 1.3046, - "35": 1.30812, - "40": 1.33609, - "45": 1.29932, - "50": 1.29774, - "55": 1.3003, - "60": 1.30422, - "65": 1.29772, - "70": 1.29885, - "75": 1.30735, - "80": 1.3284, - "85": 1.30253, - "90": 1.3315, - "95": 1.30266, - "100": 1.3038 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_h100.json deleted file mode 100644 index 9d95cec6ff..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.48367, "5": 10.4764, "10": 10.47268, "15": 10.47925, "20": 10.45448, "25": 10.38182, "30": 10.21159, "35": 10.10576, "40": 9.98131, "45": 9.82324, "50": 9.72977, "55": 9.61012, "60": 9.51845, "65": 9.4581, "70": 9.37599, "75": 9.37873, "80": 9.31495, "85": 9.35008, "90": 9.2849, "95": 9.27998, "100": 9.14808}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2570.0, "5": 2033.0, "10": 2635.0, "15": 2027.0, "20": 2850.0, "25": 2514.0, "30": 2858.0, "35": 2396.0, "40": 3386.0, "45": 3575.0, "50": 2149.0, "55": 3552.0, "60": 2461.0, "65": 3090.0, "70": 4409.0, "75": 4761.0, "80": 3795.0, "85": 4392.0, "90": 4389.0, "95": 4668.0, "100": 3371.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1784014336.0, "5": 1784014336.0, "10": 1784014336.0, "15": 1784014336.0, "20": 1784014336.0, "25": 1784014336.0, "30": 1784014336.0, "35": 1784014336.0, "40": 1784014336.0, "45": 1784014336.0, "50": 1784014336.0, "55": 1784014336.0, "60": 1784014336.0, "65": 1784014336.0, "70": 1784014336.0, "75": 1784014336.0, "80": 1784014336.0, "85": 1784014336.0, "90": 1784014336.0, "95": 1784014336.0, "100": 1784014336.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2365860864.0, "5": 3108323328.0, "10": 3108323328.0, "15": 3108323328.0, "20": 3108323328.0, "25": 3108323328.0, "30": 3108323328.0, "35": 3108323328.0, "40": 3108323328.0, "45": 3108323328.0, "50": 3108323328.0, "55": 3108323328.0, "60": 3108323328.0, "65": 3108323328.0, "70": 3108323328.0, "75": 3108323328.0, "80": 3108323328.0, "85": 3108323328.0, "90": 3108323328.0, "95": 3108323328.0, "100": 3108323328.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.952, "5": 0.9152, "10": 0.90016, "15": 1.02052, "20": 0.83245, "25": 0.81421, "30": 0.82286, "35": 0.81792, "40": 0.87247, "45": 0.83604, "50": 0.8256, "55": 0.8313, "60": 1.12795, "65": 0.82458, "70": 0.83513, "75": 0.82314, "80": 0.96015, "85": 0.89432, "90": 0.8203, "95": 0.82739, "100": 0.88667}}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_lts_dgx_a100.json deleted file mode 100644 index 7c71e4a62d..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/golden_values_lts_dgx_a100.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34964, "30": 10.1728, "35": 10.04262, "40": 9.90767, "45": 9.75792, "50": 9.67684, "55": 9.55378, "60": 9.45458, "65": 9.42133, "70": 9.30109, "75": 9.32203, "80": 9.26184, "85": 9.29667, "90": 9.23332, "95": 9.23793, "100": 9.10611}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2151.0, "25": 2601.0, "30": 2801.0, "35": 3107.0, "40": 2294.0, "45": 3909.0, "50": 3482.0, "55": 3606.0, "60": 2653.0, "65": 3341.0, "70": 3849.0, "75": 5090.0, "80": 3613.0, "85": 4194.0, "90": 4618.0, "95": 4439.0, "100": 3224.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0, "55": 1754654208.0, "60": 1754654208.0, "65": 1754654208.0, "70": 1754654208.0, "75": 1754654208.0, "80": 1754654208.0, "85": 1754654208.0, "90": 1754654208.0, "95": 1754654208.0, "100": 1754654208.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.95742, "5": 1.16734, "10": 2.45473, "15": 1.45839, "20": 1.51474, "25": 1.15989, "30": 1.14801, "35": 1.14584, "40": 1.15517, "45": 1.14468, "50": 1.14969, "55": 1.15684, "60": 1.14892, "65": 1.14737, "70": 1.30233, "75": 1.37176, "80": 1.1466, "85": 1.24468, "90": 1.15157, "95": 1.15026, "100": 1.15254}}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml deleted file mode 100644 index fc4c836c98..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_frozen_resume_torch_dist/model_config.yaml +++ /dev/null @@ -1,47 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 128 - --seq-length: 512 - --max-position-embeddings: 512 - --train-iters: 100 - --timing-log-level: 0 - --lr-decay-iters: 990000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.0001 - --min-lr: 0.00001 - --lr-warmup-fraction: 0.01 - --log-interval: 1 - --save-interval: 50 - --eval-interval: 1000 - --eval-iters: 10 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --deterministic-mode: true - --use-checkpoint-args: true - --use-checkpoint-opt_param-scheduler: true - --no-gradient-accumulation-fusion: true - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --ckpt-format: torch - --dist-ckpt-strictness: log_all # backward compatibility for TE changes - --attention-backend: unfused -TEST_TYPE: frozen-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_a100.json deleted file mode 100644 index 1ddb77e6ec..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_a100.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49711, "5": 10.48154, "10": 10.50197, "15": 10.45909, "20": 10.44614, "25": 10.35085, "30": 10.16654, "35": 10.04394, "40": 9.9092, "45": 9.75814, "50": 9.67518}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2095.0, "5": 2660.0, "10": 2235.0, "15": 2491.0, "20": 2216.0, "25": 2531.0, "30": 2718.0, "35": 2945.0, "40": 2310.0, "45": 3831.0, "50": 3502.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1767237120.0, "5": 1767237120.0, "10": 1767237120.0, "15": 1767237120.0, "20": 1767237120.0, "25": 1767237120.0, "30": 1767237120.0, "35": 1767237120.0, "40": 1767237120.0, "45": 1767237120.0, "50": 1767237120.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.74232, "5": 1.24548, "10": 1.13693, "15": 1.14447, "20": 1.37773, "25": 1.13543, "30": 1.13581, "35": 1.13373, "40": 1.13802, "45": 1.1302, "50": 1.13618}}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json deleted file mode 100644 index fb44f049ad..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 10.4837, - "2": 10.48435, - "3": 10.48251, - "4": 10.48303, - "5": 10.47647, - "6": 10.48423, - "7": 10.48457, - "8": 10.48837, - "9": 10.49003, - "10": 10.47255, - "11": 10.47245, - "12": 10.4828, - "13": 10.47855, - "14": 10.45162, - "15": 10.47936, - "16": 10.45364, - "17": 10.45143, - "18": 10.46239, - "19": 10.44136, - "20": 10.45438, - "21": 10.43469, - "22": 10.40587, - "23": 10.39982, - "24": 10.37585, - "25": 10.38173, - "26": 10.35154, - "27": 10.35401, - "28": 10.3497, - "29": 10.28714, - "30": 10.21194, - "31": 10.17274, - "32": 10.13439, - "33": 10.14753, - "34": 10.10759, - "35": 10.10592, - "36": 10.08756, - "37": 10.08177, - "38": 10.07257, - "39": 10.0013, - "40": 9.9816, - "41": 9.92549, - "42": 9.87529, - "43": 9.88742, - "44": 9.80641, - "45": 9.82342, - "46": 9.73815, - "47": 9.74831, - "48": 9.71619, - "49": 9.74504, - "50": 9.73004 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 2554.0, - "2": 1919.0, - "3": 1521.0, - "4": 2330.0, - "5": 2010.0, - "6": 1725.0, - "7": 2803.0, - "8": 2435.0, - "9": 2286.0, - "10": 2570.0, - "11": 2438.0, - "12": 1829.0, - "13": 2332.0, - "14": 2832.0, - "15": 2008.0, - "16": 2659.0, - "17": 2454.0, - "18": 2500.0, - "19": 2588.0, - "20": 2834.0, - "21": 2042.0, - "22": 3037.0, - "23": 2702.0, - "24": 2700.0, - "25": 2568.0, - "26": 2896.0, - "27": 2735.0, - "28": 2699.0, - "29": 2548.0, - "30": 2843.0, - "31": 2160.0, - "32": 2458.0, - "33": 2130.0, - "34": 2517.0, - "35": 2597.0, - "36": 3001.0, - "37": 3305.0, - "38": 2682.0, - "39": 2805.0, - "40": 3425.0, - "41": 1812.0, - "42": 1481.0, - "43": 1726.0, - "44": 2575.0, - "45": 3438.0, - "46": 2960.0, - "47": 2792.0, - "48": 3107.0, - "49": 2854.0, - "50": 2145.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 1767237120.0, - "2": 1767237120.0, - "3": 1767237120.0, - "4": 1767237120.0, - "5": 1767237120.0, - "6": 1767237120.0, - "7": 1767237120.0, - "8": 1767237120.0, - "9": 1767237120.0, - "10": 1767237120.0, - "11": 1767237120.0, - "12": 1767237120.0, - "13": 1767237120.0, - "14": 1767237120.0, - "15": 1767237120.0, - "16": 1767237120.0, - "17": 1767237120.0, - "18": 1767237120.0, - "19": 1767237120.0, - "20": 1767237120.0, - "21": 1767237120.0, - "22": 1767237120.0, - "23": 1767237120.0, - "24": 1767237120.0, - "25": 1767237120.0, - "26": 1767237120.0, - "27": 1767237120.0, - "28": 1767237120.0, - "29": 1767237120.0, - "30": 1767237120.0, - "31": 1767237120.0, - "32": 1767237120.0, - "33": 1767237120.0, - "34": 1767237120.0, - "35": 1767237120.0, - "36": 1767237120.0, - "37": 1767237120.0, - "38": 1767237120.0, - "39": 1767237120.0, - "40": 1767237120.0, - "41": 1767237120.0, - "42": 1767237120.0, - "43": 1767237120.0, - "44": 1767237120.0, - "45": 1767237120.0, - "46": 1767237120.0, - "47": 1767237120.0, - "48": 1767237120.0, - "49": 1767237120.0, - "50": 1767237120.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 2336500736.0, - "2": 3079487488.0, - "3": 3079487488.0, - "4": 3079487488.0, - "5": 3079487488.0, - "6": 3079487488.0, - "7": 3079487488.0, - "8": 3079487488.0, - "9": 3079487488.0, - "10": 3079487488.0, - "11": 3079487488.0, - "12": 3079487488.0, - "13": 3079487488.0, - "14": 3079487488.0, - "15": 3079487488.0, - "16": 3079487488.0, - "17": 3079487488.0, - "18": 3079487488.0, - "19": 3079487488.0, - "20": 3079487488.0, - "21": 3079487488.0, - "22": 3079487488.0, - "23": 3079487488.0, - "24": 3079487488.0, - "25": 3079487488.0, - "26": 3079487488.0, - "27": 3079487488.0, - "28": 3079487488.0, - "29": 3079487488.0, - "30": 3079487488.0, - "31": 3079487488.0, - "32": 3079487488.0, - "33": 3079487488.0, - "34": 3079487488.0, - "35": 3079487488.0, - "36": 3079487488.0, - "37": 3079487488.0, - "38": 3079487488.0, - "39": 3079487488.0, - "40": 3079487488.0, - "41": 3079487488.0, - "42": 3079487488.0, - "43": 3079487488.0, - "44": 3079487488.0, - "45": 3079487488.0, - "46": 3079487488.0, - "47": 3079487488.0, - "48": 3079487488.0, - "49": 3079487488.0, - "50": 3079487488.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 11.5674, - "2": 0.87925, - "3": 0.84214, - "4": 0.85037, - "5": 0.85134, - "6": 0.84821, - "7": 0.84955, - "8": 0.84912, - "9": 0.85227, - "10": 0.84641, - "11": 0.84805, - "12": 0.84791, - "13": 0.86059, - "14": 0.86196, - "15": 1.10537, - "16": 1.03739, - "17": 0.8309, - "18": 0.82806, - "19": 1.30044, - "20": 0.83029, - "21": 0.82677, - "22": 1.30745, - "23": 0.85382, - "24": 0.83942, - "25": 0.83871, - "26": 0.8337, - "27": 0.83434, - "28": 0.8309, - "29": 0.83936, - "30": 0.83788, - "31": 0.83476, - "32": 0.83236, - "33": 0.83163, - "34": 0.84328, - "35": 0.83702, - "36": 0.83877, - "37": 0.83834, - "38": 0.83145, - "39": 0.83941, - "40": 0.84432, - "41": 1.16619, - "42": 1.1534, - "43": 1.08513, - "44": 0.84537, - "45": 0.99113, - "46": 0.84419, - "47": 0.89066, - "48": 0.83549, - "49": 1.01154, - "50": 0.96557 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_coreweave.json deleted file mode 100644 index 0ff198806c..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_coreweave.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 10.4837, - "2": 10.48435, - "3": 10.48251, - "4": 10.48303, - "5": 10.47647, - "6": 10.48423, - "7": 10.48457, - "8": 10.48837, - "9": 10.49003, - "10": 10.47255, - "11": 10.47245, - "12": 10.4828, - "13": 10.47855, - "14": 10.45162, - "15": 10.47936, - "16": 10.45364, - "17": 10.45143, - "18": 10.46239, - "19": 10.44136, - "20": 10.45438, - "21": 10.43469, - "22": 10.40587, - "23": 10.39982, - "24": 10.37585, - "25": 10.38173, - "26": 10.35154, - "27": 10.35401, - "28": 10.3497, - "29": 10.28714, - "30": 10.21194, - "31": 10.17274, - "32": 10.13439, - "33": 10.14753, - "34": 10.10759, - "35": 10.10592, - "36": 10.08756, - "37": 10.08177, - "38": 10.07257, - "39": 10.0013, - "40": 9.9816, - "41": 9.92549, - "42": 9.87529, - "43": 9.88742, - "44": 9.80641, - "45": 9.82342, - "46": 9.73815, - "47": 9.74831, - "48": 9.71619, - "49": 9.74504, - "50": 9.73004 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 2554.0, - "2": 1919.0, - "3": 1521.0, - "4": 2330.0, - "5": 2010.0, - "6": 1725.0, - "7": 2803.0, - "8": 2435.0, - "9": 2286.0, - "10": 2570.0, - "11": 2438.0, - "12": 1829.0, - "13": 2332.0, - "14": 2832.0, - "15": 2008.0, - "16": 2659.0, - "17": 2454.0, - "18": 2500.0, - "19": 2588.0, - "20": 2834.0, - "21": 2042.0, - "22": 3037.0, - "23": 2702.0, - "24": 2700.0, - "25": 2568.0, - "26": 2896.0, - "27": 2735.0, - "28": 2699.0, - "29": 2548.0, - "30": 2843.0, - "31": 2160.0, - "32": 2458.0, - "33": 2130.0, - "34": 2517.0, - "35": 2597.0, - "36": 3001.0, - "37": 3305.0, - "38": 2682.0, - "39": 2805.0, - "40": 3425.0, - "41": 1812.0, - "42": 1481.0, - "43": 1726.0, - "44": 2575.0, - "45": 3438.0, - "46": 2960.0, - "47": 2792.0, - "48": 3107.0, - "49": 2854.0, - "50": 2145.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 1767237120.0, - "2": 1767237120.0, - "3": 1767237120.0, - "4": 1767237120.0, - "5": 1767237120.0, - "6": 1767237120.0, - "7": 1767237120.0, - "8": 1767237120.0, - "9": 1767237120.0, - "10": 1767237120.0, - "11": 1767237120.0, - "12": 1767237120.0, - "13": 1767237120.0, - "14": 1767237120.0, - "15": 1767237120.0, - "16": 1767237120.0, - "17": 1767237120.0, - "18": 1767237120.0, - "19": 1767237120.0, - "20": 1767237120.0, - "21": 1767237120.0, - "22": 1767237120.0, - "23": 1767237120.0, - "24": 1767237120.0, - "25": 1767237120.0, - "26": 1767237120.0, - "27": 1767237120.0, - "28": 1767237120.0, - "29": 1767237120.0, - "30": 1767237120.0, - "31": 1767237120.0, - "32": 1767237120.0, - "33": 1767237120.0, - "34": 1767237120.0, - "35": 1767237120.0, - "36": 1767237120.0, - "37": 1767237120.0, - "38": 1767237120.0, - "39": 1767237120.0, - "40": 1767237120.0, - "41": 1767237120.0, - "42": 1767237120.0, - "43": 1767237120.0, - "44": 1767237120.0, - "45": 1767237120.0, - "46": 1767237120.0, - "47": 1767237120.0, - "48": 1767237120.0, - "49": 1767237120.0, - "50": 1767237120.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 2336500736.0, - "2": 3079487488.0, - "3": 3079487488.0, - "4": 3079487488.0, - "5": 3079487488.0, - "6": 3079487488.0, - "7": 3079487488.0, - "8": 3079487488.0, - "9": 3079487488.0, - "10": 3079487488.0, - "11": 3079487488.0, - "12": 3079487488.0, - "13": 3079487488.0, - "14": 3079487488.0, - "15": 3079487488.0, - "16": 3079487488.0, - "17": 3079487488.0, - "18": 3079487488.0, - "19": 3079487488.0, - "20": 3079487488.0, - "21": 3079487488.0, - "22": 3079487488.0, - "23": 3079487488.0, - "24": 3079487488.0, - "25": 3079487488.0, - "26": 3079487488.0, - "27": 3079487488.0, - "28": 3079487488.0, - "29": 3079487488.0, - "30": 3079487488.0, - "31": 3079487488.0, - "32": 3079487488.0, - "33": 3079487488.0, - "34": 3079487488.0, - "35": 3079487488.0, - "36": 3079487488.0, - "37": 3079487488.0, - "38": 3079487488.0, - "39": 3079487488.0, - "40": 3079487488.0, - "41": 3079487488.0, - "42": 3079487488.0, - "43": 3079487488.0, - "44": 3079487488.0, - "45": 3079487488.0, - "46": 3079487488.0, - "47": 3079487488.0, - "48": 3079487488.0, - "49": 3079487488.0, - "50": 3079487488.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 12.70758, - "2": 0.8354, - "3": 0.78875, - "4": 0.77893, - "5": 0.81797, - "6": 0.77299, - "7": 0.76726, - "8": 0.77744, - "9": 0.77036, - "10": 0.76808, - "11": 0.77009, - "12": 0.77543, - "13": 0.78463, - "14": 0.77498, - "15": 0.76065, - "16": 1.28888, - "17": 0.78476, - "18": 0.77415, - "19": 0.77341, - "20": 1.04994, - "21": 1.25413, - "22": 0.7709, - "23": 0.85615, - "24": 0.76186, - "25": 0.75903, - "26": 0.75431, - "27": 0.76868, - "28": 0.7776, - "29": 0.74989, - "30": 0.75136, - "31": 0.7956, - "32": 0.74247, - "33": 0.73237, - "34": 0.73066, - "35": 0.74241, - "36": 0.74361, - "37": 0.77983, - "38": 0.77753, - "39": 0.75036, - "40": 0.75188, - "41": 0.75332, - "42": 0.89635, - "43": 0.73883, - "44": 0.92932, - "45": 0.73444, - "46": 0.73103, - "47": 1.01543, - "48": 1.06091, - "49": 0.92342, - "50": 1.25669 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_eos.json deleted file mode 100644 index bf20b2b00e..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_dev_dgxh100_eos.json +++ /dev/null @@ -1,287 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 10.4837, - "2": 10.48435, - "3": 10.48251, - "4": 10.48303, - "5": 10.47647, - "6": 10.48423, - "7": 10.48457, - "8": 10.48837, - "9": 10.49003, - "10": 10.47255, - "11": 10.47245, - "12": 10.4828, - "13": 10.47855, - "14": 10.45162, - "15": 10.47936, - "16": 10.45364, - "17": 10.45143, - "18": 10.46239, - "19": 10.44136, - "20": 10.45438, - "21": 10.43469, - "22": 10.40587, - "23": 10.39982, - "24": 10.37585, - "25": 10.38173, - "26": 10.35154, - "27": 10.35401, - "28": 10.3497, - "29": 10.28714, - "30": 10.21194, - "31": 10.17274, - "32": 10.13439, - "33": 10.14753, - "34": 10.10759, - "35": 10.10592, - "36": 10.08756, - "37": 10.08177, - "38": 10.07257, - "39": 10.0013, - "40": 9.9816, - "41": 9.92549, - "42": 9.87529, - "43": 9.88742, - "44": 9.80641, - "45": 9.82342, - "46": 9.73815, - "47": 9.74831, - "48": 9.71619, - "49": 9.74504, - "50": 9.73004 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 2554.0, - "2": 1919.0, - "3": 1521.0, - "4": 2330.0, - "5": 2010.0, - "6": 1725.0, - "7": 2803.0, - "8": 2435.0, - "9": 2286.0, - "10": 2570.0, - "11": 2438.0, - "12": 1829.0, - "13": 2332.0, - "14": 2832.0, - "15": 2008.0, - "16": 2659.0, - "17": 2454.0, - "18": 2500.0, - "19": 2588.0, - "20": 2834.0, - "21": 2042.0, - "22": 3037.0, - "23": 2702.0, - "24": 2700.0, - "25": 2568.0, - "26": 2896.0, - "27": 2735.0, - "28": 2699.0, - "29": 2548.0, - "30": 2843.0, - "31": 2160.0, - "32": 2458.0, - "33": 2130.0, - "34": 2517.0, - "35": 2597.0, - "36": 3001.0, - "37": 3305.0, - "38": 2682.0, - "39": 2805.0, - "40": 3425.0, - "41": 1812.0, - "42": 1481.0, - "43": 1726.0, - "44": 2575.0, - "45": 3438.0, - "46": 2960.0, - "47": 2792.0, - "48": 3107.0, - "49": 2854.0, - "50": 2145.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 1767237120.0, - "2": 1767237120.0, - "3": 1767237120.0, - "4": 1767237120.0, - "5": 1767237120.0, - "6": 1767237120.0, - "7": 1767237120.0, - "8": 1767237120.0, - "9": 1767237120.0, - "10": 1767237120.0, - "11": 1767237120.0, - "12": 1767237120.0, - "13": 1767237120.0, - "14": 1767237120.0, - "15": 1767237120.0, - "16": 1767237120.0, - "17": 1767237120.0, - "18": 1767237120.0, - "19": 1767237120.0, - "20": 1767237120.0, - "21": 1767237120.0, - "22": 1767237120.0, - "23": 1767237120.0, - "24": 1767237120.0, - "25": 1767237120.0, - "26": 1767237120.0, - "27": 1767237120.0, - "28": 1767237120.0, - "29": 1767237120.0, - "30": 1767237120.0, - "31": 1767237120.0, - "32": 1767237120.0, - "33": 1767237120.0, - "34": 1767237120.0, - "35": 1767237120.0, - "36": 1767237120.0, - "37": 1767237120.0, - "38": 1767237120.0, - "39": 1767237120.0, - "40": 1767237120.0, - "41": 1767237120.0, - "42": 1767237120.0, - "43": 1767237120.0, - "44": 1767237120.0, - "45": 1767237120.0, - "46": 1767237120.0, - "47": 1767237120.0, - "48": 1767237120.0, - "49": 1767237120.0, - "50": 1767237120.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 2336500736.0, - "2": 3079487488.0, - "3": 3079487488.0, - "4": 3079487488.0, - "5": 3079487488.0, - "6": 3079487488.0, - "7": 3079487488.0, - "8": 3079487488.0, - "9": 3079487488.0, - "10": 3079487488.0, - "11": 3079487488.0, - "12": 3079487488.0, - "13": 3079487488.0, - "14": 3079487488.0, - "15": 3079487488.0, - "16": 3079487488.0, - "17": 3079487488.0, - "18": 3079487488.0, - "19": 3079487488.0, - "20": 3079487488.0, - "21": 3079487488.0, - "22": 3079487488.0, - "23": 3079487488.0, - "24": 3079487488.0, - "25": 3079487488.0, - "26": 3079487488.0, - "27": 3079487488.0, - "28": 3079487488.0, - "29": 3079487488.0, - "30": 3079487488.0, - "31": 3079487488.0, - "32": 3079487488.0, - "33": 3079487488.0, - "34": 3079487488.0, - "35": 3079487488.0, - "36": 3079487488.0, - "37": 3079487488.0, - "38": 3079487488.0, - "39": 3079487488.0, - "40": 3079487488.0, - "41": 3079487488.0, - "42": 3079487488.0, - "43": 3079487488.0, - "44": 3079487488.0, - "45": 3079487488.0, - "46": 3079487488.0, - "47": 3079487488.0, - "48": 3079487488.0, - "49": 3079487488.0, - "50": 3079487488.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 50, - "step_interval": 1, - "values": { - "1": 11.68301, - "2": 0.87796, - "3": 0.84756, - "4": 0.85513, - "5": 0.85643, - "6": 0.85366, - "7": 0.8468, - "8": 0.84974, - "9": 0.84989, - "10": 0.8464, - "11": 0.84369, - "12": 0.84972, - "13": 0.84311, - "14": 0.85648, - "15": 1.1084, - "16": 0.8827, - "17": 0.87952, - "18": 0.88554, - "19": 0.82673, - "20": 0.82222, - "21": 1.06414, - "22": 1.09134, - "23": 1.02591, - "24": 0.82601, - "25": 0.82277, - "26": 0.81844, - "27": 0.82627, - "28": 0.82854, - "29": 0.82653, - "30": 0.82247, - "31": 0.82906, - "32": 0.82363, - "33": 0.82944, - "34": 0.82401, - "35": 0.82902, - "36": 0.83537, - "37": 0.8265, - "38": 0.82728, - "39": 0.82087, - "40": 0.82525, - "41": 0.82691, - "42": 1.14473, - "43": 0.97566, - "44": 0.82343, - "45": 0.82956, - "46": 0.82572, - "47": 0.83635, - "48": 0.94255, - "49": 0.99753, - "50": 1.10127 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_lts_dgx_a100.json deleted file mode 100644 index ade275df95..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/golden_values_lts_dgx_a100.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49566, "5": 10.48418, "10": 10.49947, "15": 10.46646, "20": 10.44788, "25": 10.34978, "30": 10.17275, "35": 10.04282, "40": 9.90773, "45": 9.75781, "50": 9.67689}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2183.0, "5": 2533.0, "10": 2162.0, "15": 2548.0, "20": 2191.0, "25": 2557.0, "30": 2864.0, "35": 2979.0, "40": 2332.0, "45": 3931.0, "50": 3611.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1767237120.0, "5": 1767237120.0, "10": 1767237120.0, "15": 1767237120.0, "20": 1767237120.0, "25": 1767237120.0, "30": 1767237120.0, "35": 1767237120.0, "40": 1767237120.0, "45": 1767237120.0, "50": 1767237120.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.99544, "5": 1.22265, "10": 1.24105, "15": 1.21376, "20": 1.20754, "25": 1.21517, "30": 1.19626, "35": 1.22975, "40": 1.1839, "45": 1.17092, "50": 1.17649}}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml deleted file mode 100644 index 8974bc1ea2..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_local_spec/model_config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 128 - --seq-length: 512 - --max-position-embeddings: 512 - --train-iters: 50 - --timing-log-level: 0 - --lr-decay-iters: 990000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.0001 - --min-lr: 0.00001 - --lr-warmup-fraction: 0.01 - --log-interval: 1 - --save-interval: 10000 - --eval-interval: 1000 - --eval-iters: 10 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --spec: local - --deterministic-mode: true - --no-gradient-accumulation-fusion: true - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --ckpt-format: torch - --attention-backend: local -TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json deleted file mode 100644 index c50e758ea6..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_a100.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.49689, "5": 10.48187, "10": 10.50191, "15": 10.45897, "20": 10.44608, "25": 10.35095, "30": 10.16631, "35": 10.04387, "40": 9.90911, "45": 9.75816, "50": 9.67525, "55": 9.55379, "60": 9.45432, "65": 9.42258, "70": 9.30031, "75": 9.32482, "80": 9.26124, "85": 9.29638, "90": 9.23211, "95": 9.23802, "100": 9.106}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2071.0, "5": 2593.0, "10": 2168.0, "15": 2415.0, "20": 2201.0, "25": 2577.0, "30": 2948.0, "35": 2983.0, "40": 2260.0, "45": 3953.0, "50": 3549.0, "55": 3586.0, "60": 2638.0, "65": 3507.0, "70": 3826.0, "75": 5012.0, "80": 3497.0, "85": 4326.0, "90": 4683.0, "95": 4357.0, "100": 3233.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0, "55": 1754654208.0, "60": 1754654208.0, "65": 1754654208.0, "70": 1754654208.0, "75": 1754654208.0, "80": 1754654208.0, "85": 1754654208.0, "90": 1754654208.0, "95": 1754654208.0, "100": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0, "55": 3055894528.0, "60": 3055894528.0, "65": 3055894528.0, "70": 3055894528.0, "75": 3055894528.0, "80": 3055894528.0, "85": 3055894528.0, "90": 3055894528.0, "95": 3055894528.0, "100": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.38046, "5": 1.68311, "10": 1.68974, "15": 1.6798, "20": 1.68022, "25": 1.71075, "30": 1.67873, "35": 1.68199, "40": 1.68158, "45": 1.68705, "50": 1.68278, "55": 1.68023, "60": 1.67589, "65": 1.68002, "70": 1.67968, "75": 1.68272, "80": 1.70105, "85": 1.68925, "90": 1.70082, "95": 1.68015, "100": 1.68441}}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json deleted file mode 100644 index 8063c89233..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgx_h100.json +++ /dev/null @@ -1,537 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 10.48367, - "2": 10.48426, - "3": 10.48254, - "4": 10.48311, - "5": 10.4764, - "6": 10.4844, - "7": 10.48458, - "8": 10.48829, - "9": 10.49008, - "10": 10.47268, - "11": 10.47256, - "12": 10.48259, - "13": 10.47857, - "14": 10.45154, - "15": 10.47925, - "16": 10.45346, - "17": 10.45145, - "18": 10.46238, - "19": 10.44113, - "20": 10.45448, - "21": 10.43454, - "22": 10.40592, - "23": 10.39961, - "24": 10.37579, - "25": 10.38182, - "26": 10.35147, - "27": 10.35388, - "28": 10.34937, - "29": 10.28711, - "30": 10.21159, - "31": 10.1726, - "32": 10.13421, - "33": 10.14744, - "34": 10.10737, - "35": 10.10581, - "36": 10.08735, - "37": 10.08157, - "38": 10.07233, - "39": 10.00094, - "40": 9.98143, - "41": 9.92541, - "42": 9.87527, - "43": 9.88711, - "44": 9.80642, - "45": 9.82325, - "46": 9.73785, - "47": 9.74817, - "48": 9.71609, - "49": 9.74484, - "50": 9.72982, - "51": 9.71485, - "52": 9.66475, - "53": 9.60919, - "54": 9.62705, - "55": 9.61012, - "56": 9.617, - "57": 9.56786, - "58": 9.52731, - "59": 9.51668, - "60": 9.51865, - "61": 9.53132, - "62": 9.45016, - "63": 9.45725, - "64": 9.43435, - "65": 9.45801, - "66": 9.4368, - "67": 9.3968, - "68": 9.36474, - "69": 9.4095, - "70": 9.376, - "71": 9.41716, - "72": 9.42574, - "73": 9.37581, - "74": 9.41547, - "75": 9.37891, - "76": 9.28017, - "77": 9.32205, - "78": 9.35754, - "79": 9.32162, - "80": 9.31486, - "81": 9.2678, - "82": 9.34178, - "83": 9.32145, - "84": 9.24785, - "85": 9.35023, - "86": 9.22392, - "87": 9.3062, - "88": 9.29891, - "89": 9.22716, - "90": 9.28483, - "91": 9.23109, - "92": 9.27463, - "93": 9.19241, - "94": 9.23984, - "95": 9.28006, - "96": 9.17526, - "97": 9.21894, - "98": 9.17192, - "99": 9.16446, - "100": 9.14816 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 2570.0, - "2": 1923.0, - "3": 1512.0, - "4": 2322.0, - "5": 2033.0, - "6": 1774.0, - "7": 2781.0, - "8": 2460.0, - "9": 2308.0, - "10": 2635.0, - "11": 2397.0, - "12": 1817.0, - "13": 2348.0, - "14": 2749.0, - "15": 2027.0, - "16": 2719.0, - "17": 2487.0, - "18": 2533.0, - "19": 2547.0, - "20": 2850.0, - "21": 1990.0, - "22": 2884.0, - "23": 2857.0, - "24": 2685.0, - "25": 2514.0, - "26": 2958.0, - "27": 2673.0, - "28": 2723.0, - "29": 2571.0, - "30": 2858.0, - "31": 2157.0, - "32": 2357.0, - "33": 2242.0, - "34": 2464.0, - "35": 2544.0, - "36": 2933.0, - "37": 3293.0, - "38": 2730.0, - "39": 2795.0, - "40": 3310.0, - "41": 1816.0, - "42": 1467.0, - "43": 1817.0, - "44": 2633.0, - "45": 3576.0, - "46": 3015.0, - "47": 2805.0, - "48": 3071.0, - "49": 2974.0, - "50": 2267.0, - "51": 1923.0, - "52": 2515.0, - "53": 3615.0, - "54": 3426.0, - "55": 3436.0, - "56": 4411.0, - "57": 4095.0, - "58": 4308.0, - "59": 1687.0, - "60": 2431.0, - "61": 2151.0, - "62": 3986.0, - "63": 3558.0, - "64": 4286.0, - "65": 3052.0, - "66": 1720.0, - "67": 1910.0, - "68": 4193.0, - "69": 4347.0, - "70": 4596.0, - "71": 2078.0, - "72": 4406.0, - "73": 4062.0, - "74": 3358.0, - "75": 4606.0, - "76": 2187.0, - "77": 4854.0, - "78": 4098.0, - "79": 2652.0, - "80": 3776.0, - "81": 3550.0, - "82": 3031.0, - "83": 5345.0, - "84": 4396.0, - "85": 4354.0, - "86": 3332.0, - "87": 4815.0, - "88": 3303.0, - "89": 4611.0, - "90": 4346.0, - "91": 4361.0, - "92": 3502.0, - "93": 5624.0, - "94": 3733.0, - "95": 4728.0, - "96": 3534.0, - "97": 3873.0, - "98": 4525.0, - "99": 4329.0, - "100": 3365.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 1784014336.0, - "2": 1784014336.0, - "3": 1784014336.0, - "4": 1784014336.0, - "5": 1784014336.0, - "6": 1784014336.0, - "7": 1784014336.0, - "8": 1784014336.0, - "9": 1784014336.0, - "10": 1784014336.0, - "11": 1784014336.0, - "12": 1784014336.0, - "13": 1784014336.0, - "14": 1784014336.0, - "15": 1784014336.0, - "16": 1784014336.0, - "17": 1784014336.0, - "18": 1784014336.0, - "19": 1784014336.0, - "20": 1784014336.0, - "21": 1784014336.0, - "22": 1784014336.0, - "23": 1784014336.0, - "24": 1784014336.0, - "25": 1784014336.0, - "26": 1784014336.0, - "27": 1784014336.0, - "28": 1784014336.0, - "29": 1784014336.0, - "30": 1784014336.0, - "31": 1784014336.0, - "32": 1784014336.0, - "33": 1784014336.0, - "34": 1784014336.0, - "35": 1784014336.0, - "36": 1784014336.0, - "37": 1784014336.0, - "38": 1784014336.0, - "39": 1784014336.0, - "40": 1784014336.0, - "41": 1784014336.0, - "42": 1784014336.0, - "43": 1784014336.0, - "44": 1784014336.0, - "45": 1784014336.0, - "46": 1784014336.0, - "47": 1784014336.0, - "48": 1784014336.0, - "49": 1784014336.0, - "50": 1784014336.0, - "51": 1784014336.0, - "52": 1784014336.0, - "53": 1784014336.0, - "54": 1784014336.0, - "55": 1784014336.0, - "56": 1784014336.0, - "57": 1784014336.0, - "58": 1784014336.0, - "59": 1784014336.0, - "60": 1784014336.0, - "61": 1784014336.0, - "62": 1784014336.0, - "63": 1784014336.0, - "64": 1784014336.0, - "65": 1784014336.0, - "66": 1784014336.0, - "67": 1784014336.0, - "68": 1784014336.0, - "69": 1784014336.0, - "70": 1784014336.0, - "71": 1784014336.0, - "72": 1784014336.0, - "73": 1784014336.0, - "74": 1784014336.0, - "75": 1784014336.0, - "76": 1784014336.0, - "77": 1784014336.0, - "78": 1784014336.0, - "79": 1784014336.0, - "80": 1784014336.0, - "81": 1784014336.0, - "82": 1784014336.0, - "83": 1784014336.0, - "84": 1784014336.0, - "85": 1784014336.0, - "86": 1784014336.0, - "87": 1784014336.0, - "88": 1784014336.0, - "89": 1784014336.0, - "90": 1784014336.0, - "91": 1784014336.0, - "92": 1784014336.0, - "93": 1784014336.0, - "94": 1784014336.0, - "95": 1784014336.0, - "96": 1784014336.0, - "97": 1784014336.0, - "98": 1784014336.0, - "99": 1784014336.0, - "100": 1784014336.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 2365860864.0, - "2": 3108323328.0, - "3": 3108323328.0, - "4": 3108323328.0, - "5": 3108323328.0, - "6": 3108323328.0, - "7": 3108323328.0, - "8": 3108323328.0, - "9": 3108323328.0, - "10": 3108323328.0, - "11": 3108323328.0, - "12": 3108323328.0, - "13": 3108323328.0, - "14": 3108323328.0, - "15": 3108323328.0, - "16": 3108323328.0, - "17": 3108323328.0, - "18": 3108323328.0, - "19": 3108323328.0, - "20": 3108323328.0, - "21": 3108323328.0, - "22": 3108323328.0, - "23": 3108323328.0, - "24": 3108323328.0, - "25": 3108323328.0, - "26": 3108323328.0, - "27": 3108323328.0, - "28": 3108323328.0, - "29": 3108323328.0, - "30": 3108323328.0, - "31": 3108323328.0, - "32": 3108323328.0, - "33": 3108323328.0, - "34": 3108323328.0, - "35": 3108323328.0, - "36": 3108323328.0, - "37": 3108323328.0, - "38": 3108323328.0, - "39": 3108323328.0, - "40": 3108323328.0, - "41": 3108323328.0, - "42": 3108323328.0, - "43": 3108323328.0, - "44": 3108323328.0, - "45": 3108323328.0, - "46": 3108323328.0, - "47": 3108323328.0, - "48": 3108323328.0, - "49": 3108323328.0, - "50": 3108323328.0, - "51": 3108323328.0, - "52": 3108323328.0, - "53": 3108323328.0, - "54": 3108323328.0, - "55": 3108323328.0, - "56": 3108323328.0, - "57": 3108323328.0, - "58": 3108323328.0, - "59": 3108323328.0, - "60": 3108323328.0, - "61": 3108323328.0, - "62": 3108323328.0, - "63": 3108323328.0, - "64": 3108323328.0, - "65": 3108323328.0, - "66": 3108323328.0, - "67": 3108323328.0, - "68": 3108323328.0, - "69": 3108323328.0, - "70": 3108323328.0, - "71": 3108323328.0, - "72": 3108323328.0, - "73": 3108323328.0, - "74": 3108323328.0, - "75": 3108323328.0, - "76": 3108323328.0, - "77": 3108323328.0, - "78": 3108323328.0, - "79": 3108323328.0, - "80": 3108323328.0, - "81": 3108323328.0, - "82": 3108323328.0, - "83": 3108323328.0, - "84": 3108323328.0, - "85": 3108323328.0, - "86": 3108323328.0, - "87": 3108323328.0, - "88": 3108323328.0, - "89": 3108323328.0, - "90": 3108323328.0, - "91": 3108323328.0, - "92": 3108323328.0, - "93": 3108323328.0, - "94": 3108323328.0, - "95": 3108323328.0, - "96": 3108323328.0, - "97": 3108323328.0, - "98": 3108323328.0, - "99": 3108323328.0, - "100": 3108323328.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 12.25998, - "2": 1.04599, - "3": 1.00983, - "4": 1.01193, - "5": 1.01326, - "6": 1.01181, - "7": 1.01264, - "8": 1.01822, - "9": 1.02424, - "10": 1.0191, - "11": 1.01303, - "12": 1.00485, - "13": 1.0025, - "14": 1.00999, - "15": 1.00956, - "16": 1.00094, - "17": 1.00769, - "18": 1.01014, - "19": 1.01639, - "20": 1.22304, - "21": 1.4851, - "22": 1.19412, - "23": 1.01165, - "24": 1.0106, - "25": 1.01512, - "26": 1.00595, - "27": 1.01769, - "28": 1.01182, - "29": 1.00676, - "30": 1.00481, - "31": 1.1042, - "32": 1.00908, - "33": 1.01083, - "34": 1.00353, - "35": 1.00454, - "36": 1.00641, - "37": 1.00279, - "38": 1.00471, - "39": 1.00143, - "40": 1.00802, - "41": 1.00755, - "42": 1.00913, - "43": 1.00814, - "44": 1.00935, - "45": 1.00635, - "46": 1.01076, - "47": 1.01077, - "48": 1.14065, - "49": 1.24856, - "50": 1.09012, - "51": 1.03825, - "52": 1.44742, - "53": 1.3184, - "54": 1.01374, - "55": 1.01506, - "56": 1.01099, - "57": 1.04106, - "58": 1.02232, - "59": 1.01748, - "60": 1.00992, - "61": 1.02073, - "62": 1.02809, - "63": 1.34383, - "64": 1.38941, - "65": 1.10673, - "66": 1.01505, - "67": 1.00839, - "68": 1.00645, - "69": 1.01066, - "70": 1.01137, - "71": 1.35475, - "72": 1.02215, - "73": 1.0187, - "74": 1.01939, - "75": 1.10218, - "76": 1.12059, - "77": 1.12057, - "78": 1.03631, - "79": 1.12601, - "80": 1.33494, - "81": 1.09935, - "82": 1.06264, - "83": 1.31187, - "84": 1.0139, - "85": 1.00708, - "86": 1.02816, - "87": 1.02033, - "88": 1.01728, - "89": 1.2628, - "90": 1.01941, - "91": 1.01944, - "92": 1.0295, - "93": 1.01897, - "94": 1.01663, - "95": 1.02386, - "96": 1.00901, - "97": 1.00751, - "98": 1.0074, - "99": 1.00366, - "100": 1.00628 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json deleted file mode 100644 index 137f195264..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_coreweave.json +++ /dev/null @@ -1,537 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 10.48367, - "2": 10.48426, - "3": 10.48254, - "4": 10.48311, - "5": 10.4764, - "6": 10.4844, - "7": 10.48458, - "8": 10.48829, - "9": 10.49008, - "10": 10.47268, - "11": 10.47256, - "12": 10.48259, - "13": 10.47857, - "14": 10.45154, - "15": 10.47925, - "16": 10.45346, - "17": 10.45145, - "18": 10.46238, - "19": 10.44113, - "20": 10.45448, - "21": 10.43454, - "22": 10.40592, - "23": 10.39961, - "24": 10.37579, - "25": 10.38182, - "26": 10.35147, - "27": 10.35388, - "28": 10.34937, - "29": 10.28711, - "30": 10.21159, - "31": 10.1726, - "32": 10.13421, - "33": 10.14744, - "34": 10.10737, - "35": 10.10581, - "36": 10.08735, - "37": 10.08157, - "38": 10.07233, - "39": 10.00094, - "40": 9.98143, - "41": 9.92541, - "42": 9.87527, - "43": 9.88711, - "44": 9.80642, - "45": 9.82325, - "46": 9.73785, - "47": 9.74817, - "48": 9.71609, - "49": 9.74484, - "50": 9.72982, - "51": 9.71485, - "52": 9.66475, - "53": 9.60919, - "54": 9.62705, - "55": 9.61012, - "56": 9.617, - "57": 9.56786, - "58": 9.52731, - "59": 9.51668, - "60": 9.51865, - "61": 9.53132, - "62": 9.45016, - "63": 9.45725, - "64": 9.43435, - "65": 9.45801, - "66": 9.4368, - "67": 9.3968, - "68": 9.36474, - "69": 9.4095, - "70": 9.376, - "71": 9.41716, - "72": 9.42574, - "73": 9.37581, - "74": 9.41547, - "75": 9.37891, - "76": 9.28017, - "77": 9.32205, - "78": 9.35754, - "79": 9.32162, - "80": 9.31486, - "81": 9.2678, - "82": 9.34178, - "83": 9.32145, - "84": 9.24785, - "85": 9.35023, - "86": 9.22392, - "87": 9.3062, - "88": 9.29891, - "89": 9.22716, - "90": 9.28483, - "91": 9.23109, - "92": 9.27463, - "93": 9.19241, - "94": 9.23984, - "95": 9.28006, - "96": 9.17526, - "97": 9.21894, - "98": 9.17192, - "99": 9.16446, - "100": 9.14816 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 2570.0, - "2": 1923.0, - "3": 1512.0, - "4": 2322.0, - "5": 2033.0, - "6": 1774.0, - "7": 2781.0, - "8": 2460.0, - "9": 2308.0, - "10": 2635.0, - "11": 2397.0, - "12": 1817.0, - "13": 2348.0, - "14": 2749.0, - "15": 2027.0, - "16": 2719.0, - "17": 2487.0, - "18": 2533.0, - "19": 2547.0, - "20": 2850.0, - "21": 1990.0, - "22": 2884.0, - "23": 2857.0, - "24": 2685.0, - "25": 2514.0, - "26": 2958.0, - "27": 2673.0, - "28": 2723.0, - "29": 2571.0, - "30": 2858.0, - "31": 2157.0, - "32": 2357.0, - "33": 2242.0, - "34": 2464.0, - "35": 2544.0, - "36": 2933.0, - "37": 3293.0, - "38": 2730.0, - "39": 2795.0, - "40": 3310.0, - "41": 1816.0, - "42": 1467.0, - "43": 1817.0, - "44": 2633.0, - "45": 3576.0, - "46": 3015.0, - "47": 2805.0, - "48": 3071.0, - "49": 2974.0, - "50": 2267.0, - "51": 1923.0, - "52": 2515.0, - "53": 3615.0, - "54": 3426.0, - "55": 3436.0, - "56": 4411.0, - "57": 4095.0, - "58": 4308.0, - "59": 1687.0, - "60": 2431.0, - "61": 2151.0, - "62": 3986.0, - "63": 3558.0, - "64": 4286.0, - "65": 3052.0, - "66": 1720.0, - "67": 1910.0, - "68": 4193.0, - "69": 4347.0, - "70": 4596.0, - "71": 2078.0, - "72": 4406.0, - "73": 4062.0, - "74": 3358.0, - "75": 4606.0, - "76": 2187.0, - "77": 4854.0, - "78": 4098.0, - "79": 2652.0, - "80": 3776.0, - "81": 3550.0, - "82": 3031.0, - "83": 5345.0, - "84": 4396.0, - "85": 4354.0, - "86": 3332.0, - "87": 4815.0, - "88": 3303.0, - "89": 4611.0, - "90": 4346.0, - "91": 4361.0, - "92": 3502.0, - "93": 5624.0, - "94": 3733.0, - "95": 4728.0, - "96": 3534.0, - "97": 3873.0, - "98": 4525.0, - "99": 4329.0, - "100": 3365.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 1784014336.0, - "2": 1784014336.0, - "3": 1784014336.0, - "4": 1784014336.0, - "5": 1784014336.0, - "6": 1784014336.0, - "7": 1784014336.0, - "8": 1784014336.0, - "9": 1784014336.0, - "10": 1784014336.0, - "11": 1784014336.0, - "12": 1784014336.0, - "13": 1784014336.0, - "14": 1784014336.0, - "15": 1784014336.0, - "16": 1784014336.0, - "17": 1784014336.0, - "18": 1784014336.0, - "19": 1784014336.0, - "20": 1784014336.0, - "21": 1784014336.0, - "22": 1784014336.0, - "23": 1784014336.0, - "24": 1784014336.0, - "25": 1784014336.0, - "26": 1784014336.0, - "27": 1784014336.0, - "28": 1784014336.0, - "29": 1784014336.0, - "30": 1784014336.0, - "31": 1784014336.0, - "32": 1784014336.0, - "33": 1784014336.0, - "34": 1784014336.0, - "35": 1784014336.0, - "36": 1784014336.0, - "37": 1784014336.0, - "38": 1784014336.0, - "39": 1784014336.0, - "40": 1784014336.0, - "41": 1784014336.0, - "42": 1784014336.0, - "43": 1784014336.0, - "44": 1784014336.0, - "45": 1784014336.0, - "46": 1784014336.0, - "47": 1784014336.0, - "48": 1784014336.0, - "49": 1784014336.0, - "50": 1784014336.0, - "51": 1784014336.0, - "52": 1784014336.0, - "53": 1784014336.0, - "54": 1784014336.0, - "55": 1784014336.0, - "56": 1784014336.0, - "57": 1784014336.0, - "58": 1784014336.0, - "59": 1784014336.0, - "60": 1784014336.0, - "61": 1784014336.0, - "62": 1784014336.0, - "63": 1784014336.0, - "64": 1784014336.0, - "65": 1784014336.0, - "66": 1784014336.0, - "67": 1784014336.0, - "68": 1784014336.0, - "69": 1784014336.0, - "70": 1784014336.0, - "71": 1784014336.0, - "72": 1784014336.0, - "73": 1784014336.0, - "74": 1784014336.0, - "75": 1784014336.0, - "76": 1784014336.0, - "77": 1784014336.0, - "78": 1784014336.0, - "79": 1784014336.0, - "80": 1784014336.0, - "81": 1784014336.0, - "82": 1784014336.0, - "83": 1784014336.0, - "84": 1784014336.0, - "85": 1784014336.0, - "86": 1784014336.0, - "87": 1784014336.0, - "88": 1784014336.0, - "89": 1784014336.0, - "90": 1784014336.0, - "91": 1784014336.0, - "92": 1784014336.0, - "93": 1784014336.0, - "94": 1784014336.0, - "95": 1784014336.0, - "96": 1784014336.0, - "97": 1784014336.0, - "98": 1784014336.0, - "99": 1784014336.0, - "100": 1784014336.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 2365860864.0, - "2": 3108323328.0, - "3": 3108323328.0, - "4": 3108323328.0, - "5": 3108323328.0, - "6": 3108323328.0, - "7": 3108323328.0, - "8": 3108323328.0, - "9": 3108323328.0, - "10": 3108845568.0, - "11": 3108845568.0, - "12": 3108845568.0, - "13": 3108845568.0, - "14": 3108845568.0, - "15": 3108845568.0, - "16": 3108845568.0, - "17": 3108845568.0, - "18": 3108845568.0, - "19": 3108845568.0, - "20": 3108845568.0, - "21": 3108845568.0, - "22": 3108845568.0, - "23": 3108845568.0, - "24": 3108845568.0, - "25": 3108845568.0, - "26": 3108845568.0, - "27": 3108845568.0, - "28": 3108845568.0, - "29": 3108845568.0, - "30": 3108845568.0, - "31": 3108845568.0, - "32": 3108845568.0, - "33": 3108845568.0, - "34": 3108845568.0, - "35": 3108845568.0, - "36": 3108845568.0, - "37": 3108846080.0, - "38": 3108846080.0, - "39": 3108846080.0, - "40": 3108846080.0, - "41": 3108846080.0, - "42": 3108846080.0, - "43": 3108846080.0, - "44": 3108846080.0, - "45": 3108846080.0, - "46": 3108846080.0, - "47": 3108846080.0, - "48": 3108846080.0, - "49": 3108846080.0, - "50": 3108846080.0, - "51": 3108846080.0, - "52": 3108846080.0, - "53": 3108846080.0, - "54": 3108846080.0, - "55": 3108846080.0, - "56": 3108846080.0, - "57": 3108846080.0, - "58": 3108846080.0, - "59": 3108846080.0, - "60": 3108846080.0, - "61": 3108846080.0, - "62": 3108847616.0, - "63": 3108847616.0, - "64": 3108847616.0, - "65": 3108847616.0, - "66": 3108847616.0, - "67": 3108847616.0, - "68": 3108847616.0, - "69": 3108847616.0, - "70": 3108847616.0, - "71": 3108847616.0, - "72": 3108847616.0, - "73": 3108847616.0, - "74": 3108847616.0, - "75": 3108847616.0, - "76": 3108847616.0, - "77": 3108847616.0, - "78": 3108847616.0, - "79": 3108847616.0, - "80": 3108847616.0, - "81": 3108847616.0, - "82": 3108847616.0, - "83": 3108847616.0, - "84": 3108847616.0, - "85": 3108847616.0, - "86": 3108847616.0, - "87": 3108847616.0, - "88": 3108847616.0, - "89": 3108847616.0, - "90": 3108847616.0, - "91": 3108847616.0, - "92": 3108847616.0, - "93": 3108847616.0, - "94": 3108847616.0, - "95": 3108847616.0, - "96": 3108847616.0, - "97": 3108847616.0, - "98": 3108847616.0, - "99": 3108847616.0, - "100": 3108847616.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 13.09913, - "2": 1.02984, - "3": 0.9509, - "4": 0.92961, - "5": 0.88057, - "6": 0.86499, - "7": 0.87435, - "8": 0.87748, - "9": 0.88481, - "10": 0.87813, - "11": 0.88937, - "12": 0.91092, - "13": 0.85441, - "14": 0.87519, - "15": 0.89434, - "16": 1.08771, - "17": 0.87461, - "18": 0.8785, - "19": 1.08419, - "20": 1.00138, - "21": 0.98051, - "22": 1.32806, - "23": 0.85982, - "24": 0.88387, - "25": 0.88245, - "26": 0.87335, - "27": 0.88317, - "28": 0.88985, - "29": 0.895, - "30": 0.87281, - "31": 0.88109, - "32": 0.87358, - "33": 0.89681, - "34": 0.91049, - "35": 0.89763, - "36": 0.89169, - "37": 0.89357, - "38": 0.89732, - "39": 0.88241, - "40": 0.90292, - "41": 0.88715, - "42": 0.90721, - "43": 1.00024, - "44": 1.05261, - "45": 0.88589, - "46": 0.89065, - "47": 1.19824, - "48": 1.03763, - "49": 0.88362, - "50": 2.54681, - "51": 0.88554, - "52": 1.29624, - "53": 0.90469, - "54": 1.25859, - "55": 0.8959, - "56": 0.89223, - "57": 0.91307, - "58": 0.9046, - "59": 0.90217, - "60": 1.19764, - "61": 0.96385, - "62": 1.26273, - "63": 1.00365, - "64": 0.95065, - "65": 0.87723, - "66": 0.87675, - "67": 0.8752, - "68": 1.1677, - "69": 0.87584, - "70": 0.88581, - "71": 1.19607, - "72": 0.88789, - "73": 1.11276, - "74": 0.89256, - "75": 0.8887, - "76": 1.28091, - "77": 0.93746, - "78": 0.87892, - "79": 1.07934, - "80": 0.88837, - "81": 0.87726, - "82": 0.87655, - "83": 0.89632, - "84": 0.90579, - "85": 0.88535, - "86": 0.8924, - "87": 0.8763, - "88": 0.8769, - "89": 0.87952, - "90": 0.89745, - "91": 0.8736, - "92": 0.8825, - "93": 0.8845, - "94": 0.87495, - "95": 0.88075, - "96": 0.94076, - "97": 0.87753, - "98": 0.88407, - "99": 0.89106, - "100": 0.88092 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_eos.json deleted file mode 100644 index dc5d31f8f8..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_dev_dgxh100_eos.json +++ /dev/null @@ -1,537 +0,0 @@ -{ - "lm loss": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 10.48367, - "2": 10.48426, - "3": 10.48254, - "4": 10.48311, - "5": 10.4764, - "6": 10.4844, - "7": 10.48458, - "8": 10.48829, - "9": 10.49008, - "10": 10.47268, - "11": 10.47256, - "12": 10.48259, - "13": 10.47857, - "14": 10.45154, - "15": 10.47925, - "16": 10.45346, - "17": 10.45145, - "18": 10.46238, - "19": 10.44113, - "20": 10.45448, - "21": 10.43454, - "22": 10.40592, - "23": 10.39961, - "24": 10.37579, - "25": 10.38182, - "26": 10.35147, - "27": 10.35388, - "28": 10.34937, - "29": 10.28711, - "30": 10.21159, - "31": 10.1726, - "32": 10.13421, - "33": 10.14744, - "34": 10.10737, - "35": 10.10581, - "36": 10.08735, - "37": 10.08157, - "38": 10.07233, - "39": 10.00094, - "40": 9.98143, - "41": 9.92541, - "42": 9.87527, - "43": 9.88711, - "44": 9.80642, - "45": 9.82325, - "46": 9.73785, - "47": 9.74817, - "48": 9.71609, - "49": 9.74484, - "50": 9.72982, - "51": 9.71485, - "52": 9.66475, - "53": 9.60919, - "54": 9.62705, - "55": 9.61012, - "56": 9.617, - "57": 9.56786, - "58": 9.52731, - "59": 9.51668, - "60": 9.51865, - "61": 9.53132, - "62": 9.45016, - "63": 9.45725, - "64": 9.43435, - "65": 9.45801, - "66": 9.4368, - "67": 9.3968, - "68": 9.36474, - "69": 9.4095, - "70": 9.376, - "71": 9.41716, - "72": 9.42574, - "73": 9.37581, - "74": 9.41547, - "75": 9.37891, - "76": 9.28017, - "77": 9.32205, - "78": 9.35754, - "79": 9.32162, - "80": 9.31486, - "81": 9.2678, - "82": 9.34178, - "83": 9.32145, - "84": 9.24785, - "85": 9.35023, - "86": 9.22392, - "87": 9.3062, - "88": 9.29891, - "89": 9.22716, - "90": 9.28483, - "91": 9.23109, - "92": 9.27463, - "93": 9.19241, - "94": 9.23984, - "95": 9.28006, - "96": 9.17526, - "97": 9.21894, - "98": 9.17192, - "99": 9.16446, - "100": 9.14816 - } - }, - "num-zeros": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 2570.0, - "2": 1923.0, - "3": 1512.0, - "4": 2322.0, - "5": 2033.0, - "6": 1774.0, - "7": 2781.0, - "8": 2460.0, - "9": 2308.0, - "10": 2635.0, - "11": 2397.0, - "12": 1817.0, - "13": 2348.0, - "14": 2749.0, - "15": 2027.0, - "16": 2719.0, - "17": 2487.0, - "18": 2533.0, - "19": 2547.0, - "20": 2850.0, - "21": 1990.0, - "22": 2884.0, - "23": 2857.0, - "24": 2685.0, - "25": 2514.0, - "26": 2958.0, - "27": 2673.0, - "28": 2723.0, - "29": 2571.0, - "30": 2858.0, - "31": 2157.0, - "32": 2357.0, - "33": 2242.0, - "34": 2464.0, - "35": 2544.0, - "36": 2933.0, - "37": 3293.0, - "38": 2730.0, - "39": 2795.0, - "40": 3310.0, - "41": 1816.0, - "42": 1467.0, - "43": 1817.0, - "44": 2633.0, - "45": 3576.0, - "46": 3015.0, - "47": 2805.0, - "48": 3071.0, - "49": 2974.0, - "50": 2267.0, - "51": 1923.0, - "52": 2515.0, - "53": 3615.0, - "54": 3426.0, - "55": 3436.0, - "56": 4411.0, - "57": 4095.0, - "58": 4308.0, - "59": 1687.0, - "60": 2431.0, - "61": 2151.0, - "62": 3986.0, - "63": 3558.0, - "64": 4286.0, - "65": 3052.0, - "66": 1720.0, - "67": 1910.0, - "68": 4193.0, - "69": 4347.0, - "70": 4596.0, - "71": 2078.0, - "72": 4406.0, - "73": 4062.0, - "74": 3358.0, - "75": 4606.0, - "76": 2187.0, - "77": 4854.0, - "78": 4098.0, - "79": 2652.0, - "80": 3776.0, - "81": 3550.0, - "82": 3031.0, - "83": 5345.0, - "84": 4396.0, - "85": 4354.0, - "86": 3332.0, - "87": 4815.0, - "88": 3303.0, - "89": 4611.0, - "90": 4346.0, - "91": 4361.0, - "92": 3502.0, - "93": 5624.0, - "94": 3733.0, - "95": 4728.0, - "96": 3534.0, - "97": 3873.0, - "98": 4525.0, - "99": 4329.0, - "100": 3365.0 - } - }, - "mem-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 1784014336.0, - "2": 1784014336.0, - "3": 1784014336.0, - "4": 1784014336.0, - "5": 1784014336.0, - "6": 1784014336.0, - "7": 1784014336.0, - "8": 1784014336.0, - "9": 1784014336.0, - "10": 1784014336.0, - "11": 1784014336.0, - "12": 1784014336.0, - "13": 1784014336.0, - "14": 1784014336.0, - "15": 1784014336.0, - "16": 1784014336.0, - "17": 1784014336.0, - "18": 1784014336.0, - "19": 1784014336.0, - "20": 1784014336.0, - "21": 1784014336.0, - "22": 1784014336.0, - "23": 1784014336.0, - "24": 1784014336.0, - "25": 1784014336.0, - "26": 1784014336.0, - "27": 1784014336.0, - "28": 1784014336.0, - "29": 1784014336.0, - "30": 1784014336.0, - "31": 1784014336.0, - "32": 1784014336.0, - "33": 1784014336.0, - "34": 1784014336.0, - "35": 1784014336.0, - "36": 1784014336.0, - "37": 1784014336.0, - "38": 1784014336.0, - "39": 1784014336.0, - "40": 1784014336.0, - "41": 1784014336.0, - "42": 1784014336.0, - "43": 1784014336.0, - "44": 1784014336.0, - "45": 1784014336.0, - "46": 1784014336.0, - "47": 1784014336.0, - "48": 1784014336.0, - "49": 1784014336.0, - "50": 1784014336.0, - "51": 1784014336.0, - "52": 1784014336.0, - "53": 1784014336.0, - "54": 1784014336.0, - "55": 1784014336.0, - "56": 1784014336.0, - "57": 1784014336.0, - "58": 1784014336.0, - "59": 1784014336.0, - "60": 1784014336.0, - "61": 1784014336.0, - "62": 1784014336.0, - "63": 1784014336.0, - "64": 1784014336.0, - "65": 1784014336.0, - "66": 1784014336.0, - "67": 1784014336.0, - "68": 1784014336.0, - "69": 1784014336.0, - "70": 1784014336.0, - "71": 1784014336.0, - "72": 1784014336.0, - "73": 1784014336.0, - "74": 1784014336.0, - "75": 1784014336.0, - "76": 1784014336.0, - "77": 1784014336.0, - "78": 1784014336.0, - "79": 1784014336.0, - "80": 1784014336.0, - "81": 1784014336.0, - "82": 1784014336.0, - "83": 1784014336.0, - "84": 1784014336.0, - "85": 1784014336.0, - "86": 1784014336.0, - "87": 1784014336.0, - "88": 1784014336.0, - "89": 1784014336.0, - "90": 1784014336.0, - "91": 1784014336.0, - "92": 1784014336.0, - "93": 1784014336.0, - "94": 1784014336.0, - "95": 1784014336.0, - "96": 1784014336.0, - "97": 1784014336.0, - "98": 1784014336.0, - "99": 1784014336.0, - "100": 1784014336.0 - } - }, - "mem-max-allocated-bytes": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 2365860864.0, - "2": 3108323328.0, - "3": 3108323328.0, - "4": 3108323328.0, - "5": 3108323328.0, - "6": 3108323328.0, - "7": 3108323328.0, - "8": 3108323328.0, - "9": 3108323328.0, - "10": 3108323328.0, - "11": 3108323328.0, - "12": 3108323328.0, - "13": 3108323328.0, - "14": 3108323328.0, - "15": 3108323328.0, - "16": 3108323328.0, - "17": 3108323328.0, - "18": 3108323328.0, - "19": 3108323328.0, - "20": 3108323328.0, - "21": 3108323328.0, - "22": 3108323328.0, - "23": 3108323328.0, - "24": 3108323328.0, - "25": 3108323328.0, - "26": 3108323328.0, - "27": 3108323328.0, - "28": 3108323328.0, - "29": 3108323328.0, - "30": 3108323328.0, - "31": 3108323328.0, - "32": 3108323328.0, - "33": 3108323328.0, - "34": 3108323328.0, - "35": 3108323328.0, - "36": 3108323328.0, - "37": 3108323328.0, - "38": 3108323328.0, - "39": 3108323328.0, - "40": 3108323328.0, - "41": 3108323328.0, - "42": 3108323328.0, - "43": 3108323328.0, - "44": 3108323328.0, - "45": 3108323328.0, - "46": 3108323328.0, - "47": 3108323328.0, - "48": 3108323328.0, - "49": 3108323328.0, - "50": 3108323328.0, - "51": 3108323328.0, - "52": 3108323328.0, - "53": 3108323328.0, - "54": 3108323328.0, - "55": 3108323328.0, - "56": 3108323328.0, - "57": 3108842496.0, - "58": 3108842496.0, - "59": 3108842496.0, - "60": 3108842496.0, - "61": 3108842496.0, - "62": 3108842496.0, - "63": 3108842496.0, - "64": 3108842496.0, - "65": 3108842496.0, - "66": 3108842496.0, - "67": 3108842496.0, - "68": 3108842496.0, - "69": 3108842496.0, - "70": 3108842496.0, - "71": 3108842496.0, - "72": 3108842496.0, - "73": 3108842496.0, - "74": 3108842496.0, - "75": 3108844544.0, - "76": 3108844544.0, - "77": 3108844544.0, - "78": 3108844544.0, - "79": 3108844544.0, - "80": 3108844544.0, - "81": 3108844544.0, - "82": 3108844544.0, - "83": 3108844544.0, - "84": 3108844544.0, - "85": 3108844544.0, - "86": 3108844544.0, - "87": 3108844544.0, - "88": 3108844544.0, - "89": 3108844544.0, - "90": 3108844544.0, - "91": 3108844544.0, - "92": 3108844544.0, - "93": 3108844544.0, - "94": 3108844544.0, - "95": 3108844544.0, - "96": 3108844544.0, - "97": 3108844544.0, - "98": 3108844544.0, - "99": 3108844544.0, - "100": 3108844544.0 - } - }, - "iteration-time": { - "start_step": 1, - "end_step": 100, - "step_interval": 1, - "values": { - "1": 11.84806, - "2": 1.03522, - "3": 1.00793, - "4": 1.00939, - "5": 1.00929, - "6": 1.01517, - "7": 1.01009, - "8": 1.01561, - "9": 1.02131, - "10": 1.01787, - "11": 1.01149, - "12": 1.0128, - "13": 1.01358, - "14": 1.01768, - "15": 1.23565, - "16": 1.01096, - "17": 1.19479, - "18": 1.01674, - "19": 1.01808, - "20": 1.23016, - "21": 1.01908, - "22": 1.11536, - "23": 1.0888, - "24": 1.02965, - "25": 1.03972, - "26": 1.00766, - "27": 1.00981, - "28": 1.01339, - "29": 1.01801, - "30": 1.01655, - "31": 1.01796, - "32": 1.01286, - "33": 1.01823, - "34": 1.00604, - "35": 1.01493, - "36": 1.01106, - "37": 1.00783, - "38": 1.01573, - "39": 1.01525, - "40": 1.09842, - "41": 1.39919, - "42": 1.22658, - "43": 1.00841, - "44": 0.99932, - "45": 1.00156, - "46": 1.18473, - "47": 1.01528, - "48": 1.00768, - "49": 1.00498, - "50": 0.9957, - "51": 1.29149, - "52": 1.10051, - "53": 1.00264, - "54": 1.00531, - "55": 1.30558, - "56": 0.99836, - "57": 1.00645, - "58": 1.00413, - "59": 1.00106, - "60": 1.00076, - "61": 1.32205, - "62": 1.00795, - "63": 1.2523, - "64": 1.01369, - "65": 1.01151, - "66": 1.01484, - "67": 1.00831, - "68": 1.01849, - "69": 1.01821, - "70": 1.01316, - "71": 1.01068, - "72": 1.01792, - "73": 1.47417, - "74": 1.01143, - "75": 1.14077, - "76": 1.01286, - "77": 1.08819, - "78": 1.01005, - "79": 1.0069, - "80": 1.01196, - "81": 1.0882, - "82": 1.00417, - "83": 1.29479, - "84": 1.0044, - "85": 1.0103, - "86": 1.00862, - "87": 1.01863, - "88": 1.2549, - "89": 1.0075, - "90": 1.00874, - "91": 1.0111, - "92": 1.01049, - "93": 1.01084, - "94": 1.01043, - "95": 1.01246, - "96": 1.01317, - "97": 1.09821, - "98": 1.01406, - "99": 1.00578, - "100": 1.09442 - } - } -} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json deleted file mode 100644 index 4d76420ccf..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/golden_values_lts_dgx_a100.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34954, "30": 10.17283, "35": 10.0427, "40": 9.9076, "45": 9.7577, "50": 9.67688, "55": 9.55375, "60": 9.4546, "65": 9.42141, "70": 9.30109, "75": 9.32202, "80": 9.26199, "85": 9.29667, "90": 9.2334, "95": 9.23801, "100": 9.10601}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2166.0, "25": 2639.0, "30": 2769.0, "35": 3080.0, "40": 2282.0, "45": 3831.0, "50": 3519.0, "55": 3692.0, "60": 2614.0, "65": 3344.0, "70": 4018.0, "75": 4983.0, "80": 3679.0, "85": 4082.0, "90": 4634.0, "95": 4487.0, "100": 3079.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0, "55": 1754654208.0, "60": 1754654208.0, "65": 1754654208.0, "70": 1754654208.0, "75": 1755702784.0, "80": 1754654208.0, "85": 1754654208.0, "90": 1754654208.0, "95": 1754654208.0, "100": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2300849152.0, "5": 3043311616.0, "10": 3043311616.0, "15": 3043311616.0, "20": 3043311616.0, "25": 3043311616.0, "30": 3043311616.0, "35": 3043311616.0, "40": 3043311616.0, "45": 3043311616.0, "50": 3043311616.0, "55": 3043311616.0, "60": 3043311616.0, "65": 3043311616.0, "70": 3043311616.0, "75": 3043311616.0, "80": 3043311616.0, "85": 3043311616.0, "90": 3043311616.0, "95": 3043311616.0, "100": 3043311616.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.04253, "5": 1.18237, "10": 1.19035, "15": 1.15172, "20": 1.13338, "25": 1.24747, "30": 1.14325, "35": 1.14281, "40": 1.15392, "45": 1.16568, "50": 1.16303, "55": 1.18009, "60": 1.17624, "65": 1.17621, "70": 1.1845, "75": 1.19129, "80": 1.19627, "85": 1.18614, "90": 1.18685, "95": 1.20386, "100": 1.40621}}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml deleted file mode 100644 index 4913568412..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist/model_config.yaml +++ /dev/null @@ -1,47 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 128 - --seq-length: 512 - --max-position-embeddings: 512 - --train-iters: 100 - --timing-log-level: 0 - --lr-decay-iters: 990000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.0001 - --min-lr: 0.00001 - --lr-warmup-fraction: 0.01 - --log-interval: 1 - --save-interval: 50 - --eval-interval: 1000 - --eval-iters: 10 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --deterministic-mode: true - --use-checkpoint-args: true - --use-checkpoint-opt_param-scheduler: true - --no-gradient-accumulation-fusion: true - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --ckpt-format: torch - --dist-ckpt-strictness: log_all # backward compatibility for TE changes - --attention-backend: unfused -TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml deleted file mode 100644 index 6c0dc55051..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/model_config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 128 - --seq-length: 512 - --max-position-embeddings: 512 - --train-iters: 100 - --timing-log-level: 0 - --lr-decay-iters: 990000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.0001 - --min-lr: 0.00001 - --lr-warmup-fraction: 0.01 - --log-interval: 1 - --save-interval: 50 - --eval-interval: 1000 - --eval-iters: 10 - --tensor-model-parallel-size: 2 - --pipeline-model-parallel-size: 2 - --spec: local - --deterministic-mode: true - --use-checkpoint-args: true - --use-checkpoint-opt_param-scheduler: true - --no-gradient-accumulation-fusion: true - --dist-ckpt-strictness: log_all # backward compatibility for TE changes - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --ckpt-format: torch - --attention-backend: local -TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml deleted file mode 100644 index e001ea4dc0..0000000000 --- a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/model_config.yaml +++ /dev/null @@ -1,44 +0,0 @@ -ENV_VARS: - CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 - NCCL_ALGO: Ring - CUBLAS_WORKSPACE_CONFIG: :4096:8 -MODEL_ARGS: - --num-layers: 24 - --hidden-size: 1024 - --num-attention-heads: 16 - --log-params-norm: true - --log-num-zeros-in-grad: true - --log-validation-ppl-to-tensorboard: true - --log-timers-to-tensorboard: true - --log-memory-to-tensorboard: true - --tensorboard-dir: ${TENSORBOARD_PATH} - --micro-batch-size: 4 - --global-batch-size: 128 - --seq-length: 512 - --max-position-embeddings: 512 - --train-iters: 50 - --timing-log-level: 0 - --lr-decay-iters: 990000 - --save: ${CHECKPOINT_SAVE_PATH} - --load: ${CHECKPOINT_LOAD_PATH} - --data-path: ${DATA_PATH}/text/the_pile/bert_shard00/my-bert_00_text_sentence - --vocab-file: ${DATA_PATH}/text/the_pile/bert_shard00/vocab.txt - --split: 949,50,1 - --distributed-backend: nccl - --lr: 0.0001 - --min-lr: 0.00001 - --lr-warmup-fraction: 0.01 - --log-interval: 1 - --save-interval: 10000 - --eval-interval: 1000 - --eval-iters: 10 - --tensor-model-parallel-size: 4 - --pipeline-model-parallel-size: 1 - --deterministic-mode: true - --no-gradient-accumulation-fusion: true - --data-cache-path: ${DATA_CACHE_PATH} - --bf16: true - --ckpt-format: torch - --attention-backend: unfused -TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/main.py b/tests/functional_tests/test_cases/bert/main.py new file mode 100644 index 0000000000..d893f5e2ba --- /dev/null +++ b/tests/functional_tests/test_cases/bert/main.py @@ -0,0 +1,203 @@ +import os +import sys +from typing import Any, Dict, List + +import yaml +from yaml.constructor import ConstructorError +from yaml.nodes import MappingNode, ScalarNode + +# --- Pass 1: Placeholders and Custom Loader --- + + +class Reference: + """A placeholder object for a !reference tag.""" + + def __init__(self, keys: List[str]): + self.keys = keys + + def __repr__(self): + return f"Reference(keys={self.keys})" + + +class ConfigLoader(yaml.SafeLoader): + """ + Custom YAML Loader that handles !include, creates + placeholders for !reference, and defers merge (<<) resolution. + """ + + def __init__(self, stream): + try: + self._root = os.path.dirname(stream.name) + except AttributeError: + self._root = os.path.abspath('.') + super().__init__(stream) + + def construct_mapping(self, node: MappingNode, deep: bool = False) -> Dict[Any, Any]: + """ + Override default mapping constructor to *not* call flatten_mapping. + This prevents PyYAML from trying to resolve '<<' in Pass 1. + """ + if not isinstance(node, MappingNode): + raise ConstructorError( + None, None, "expected a mapping node, but found %s" % node.id, node.start_mark + ) + + mapping = {} + for key_node, value_node in node.value: + key = self.construct_object(key_node, deep=deep) + try: + hash(key) + except TypeError as exc: + raise ConstructorError( + "while constructing a mapping", + node.start_mark, + "found unhashable key (%s)" % exc, + key_node.start_mark, + ) + + if key in mapping: + raise ConstructorError( + "while constructing a mapping", + node.start_mark, + "found duplicate key (%s)" % key, + key_node.start_mark, + ) + + value = self.construct_object(value_node, deep=deep) + mapping[key] = value + return mapping + + def construct_merge(self, node: ScalarNode) -> str: + """ + Handles the 'tag:yaml.org,2002:merge' tag (the '<<' key) + by just treating it as a plain string. + """ + return self.construct_scalar(node) + + +def include_constructor(loader: ConfigLoader, node: yaml.Node) -> Any: + """Handles !include by recursively loading with the same loader.""" + filename = os.path.join(loader._root, loader.construct_scalar(node)) + if not os.path.isfile(filename): + raise FileNotFoundError(f"Included file not found: {filename}") + + with open(filename, 'r') as f: + return yaml.load(f, Loader=type(loader)) + + +def reference_constructor(loader: ConfigLoader, node: yaml.Node) -> Reference: + """Handles !reference by creating a Reference placeholder.""" + keys = loader.construct_sequence(node) + return Reference(keys) + + +# Register all custom constructors with our loader +yaml.add_constructor('!include', include_constructor, Loader=ConfigLoader) +yaml.add_constructor('!reference', reference_constructor, Loader=ConfigLoader) +yaml.add_constructor('tag:yaml.org,2002:merge', ConfigLoader.construct_merge, Loader=ConfigLoader) + + +# --- Pass 2: Resolver Functions --- + + +def _lookup(keys: List[str], data: Dict[str, Any]) -> Any: + """Helper to look up a value from a nested dict via a key path.""" + current = data + for k in keys: + current = current[k] + return current + + +def resolve_refs(node: Any, root: Dict[str, Any]) -> Any: + """ + Recursively traverses the data structure, resolving + Reference placeholders and manually handling '<<' merges. + """ + # 1. Resolve a Reference placeholder + if isinstance(node, Reference): + # Look up the value and *recursively resolve it* + # in case it's another Reference or contains one. + found_val = _lookup(node.keys, root) + return resolve_refs(found_val, root) + + # 2. Recurse into a list + if isinstance(node, list): + return [resolve_refs(item, root) for item in node] + + # 3. Recurse into a dict (and handle merges) + if isinstance(node, dict): + new_dict = {} + + # Handle the YAML merge key '<<' first. + if '<<' in node: + # Resolve the merge source (which could be a Reference) + merge_source = resolve_refs(node['<<'], root) + + if isinstance(merge_source, dict): + # Must resolve the *contents* of the merged dict too + new_dict.update(resolve_refs(merge_source, root)) + elif isinstance(merge_source, list): + for d in merge_source: + if not isinstance(d, dict): + raise TypeError(f"YAML merge '<<' list item not a dict: {type(d)}") + new_dict.update(resolve_refs(d, root)) + elif merge_source is not None: + raise TypeError( + f"YAML merge key '<<' resolved to invalid type: {type(merge_source)}" + ) + + # Process/override with the rest of the keys + for key, value in node.items(): + if key == '<<': + continue + new_dict[key] = resolve_refs(value, root) + + return new_dict + + # 4. It's a primitive, return as-is + return node + + +# --- Main Execution --- + + +def load_config(main_config_path: str) -> Dict[str, Any]: + """ + Loads, parses, and fully resolves the two-pass YAML config. + """ + try: + # --- Pass 1: Load with custom loader --- + print("--- Running Pass 1 (Loading) ---") + pass1_data = None + with open(main_config_path, 'r') as f: + pass1_data = yaml.load(f, Loader=ConfigLoader) + + print("Result after Pass 1 (with placeholders):") + print(pass1_data) + + # --- Pass 2: Resolve references --- + print("\n--- Running Pass 2 (Resolving) ---") + # The 'root' for resolution is the entire data structure itself. + final_data = resolve_refs(pass1_data, pass1_data) + + print("\n--- Final, fully resolved data ---") + return final_data + + except Exception as e: + print(f"\nAn error occurred while parsing YAML: {e}", file=sys.stderr) + # Depending on your app, you might want to re-raise or sys.exit + raise + + +if __name__ == "__main__": + # --- Create dummy files for a self-contained test --- + # (You would remove this and just call load_config in your real app) + + # This is the main function call + + # Pretty-print the final result + print( + yaml.dump( + load_config('tp1_pp2/model_config.yaml'), default_flow_style=False, sort_keys=False + ) + ) diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp2/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp1_pp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/tp1_pp2/model_config.yaml new file mode 100644 index 0000000000..80ba5f4d23 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/tp1_pp2/model_config.yaml @@ -0,0 +1,10 @@ +.model_config: !include ../bert_mcore.yaml + +ENV_VARS: !reference [.model_config, ENV_VARS] + +MODEL_ARGS: + <<: !reference [.model_config, MODEL_ARGS] + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + +TEST_TYPE: !reference [.model_config, TEST_TYPE] diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp1_pp4_vp2/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/model_config.yaml b/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/model_config.yaml new file mode 100644 index 0000000000..3eb2cb0c1e --- /dev/null +++ b/tests/functional_tests/test_cases/bert/tp1_pp4_vp2_legacy_tokenizer/model_config.yaml @@ -0,0 +1,12 @@ +.model_config: !include ../bert_mcore.yaml + +ENV_VARS: !reference [.model_config, ENV_VARS] + +MODEL_ARGS: + <<: !reference [.model_config, MODEL_ARGS] + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 2 + --legacy-tokenizer: true + +TEST_TYPE: !reference [.model_config, TEST_TYPE] diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp2_pp2/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/tp2_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/tp2_pp2/model_config.yaml new file mode 100644 index 0000000000..14c7789551 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/tp2_pp2/model_config.yaml @@ -0,0 +1,10 @@ +.model_config: !include ../bert_mcore.yaml + +ENV_VARS: !reference [.model_config, ENV_VARS] + +MODEL_ARGS: + <<: !reference [.model_config, MODEL_ARGS] + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + +TEST_TYPE: !reference [.model_config, TEST_TYPE] diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp2_pp2_resume_torch_dist_local_spec/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/model_config.yaml b/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/model_config.yaml new file mode 100644 index 0000000000..ae03660309 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/tp2_pp2_local_spec/model_config.yaml @@ -0,0 +1,14 @@ +.model_config: !include ../bert_mcore.yaml + +ENV_VARS: !reference [.model_config, ENV_VARS] + +MODEL_ARGS: + <<: !reference [.model_config, MODEL_ARGS] + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --spec: local + --use-checkpoint-args: true + --use-checkpoint-opt_param-scheduler: true + --attention-backend: local + +TEST_TYPE: !reference [.model_config, TEST_TYPE] diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgx_h100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgx_h100.json rename to tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgx_h100.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json b/tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_coreweave.json rename to tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json b/tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_dev_dgxh100_eos.json rename to tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_dev_dgxh100_eos.json diff --git a/tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_lts_dgx_a100.json similarity index 100% rename from tests/functional_tests/test_cases/bert/bert_mcore_tp4_pp1/golden_values_lts_dgx_a100.json rename to tests/functional_tests/test_cases/bert/tp4_pp1/golden_values_lts_dgx_a100.json diff --git a/tests/functional_tests/test_cases/bert/tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/tp4_pp1/model_config.yaml new file mode 100644 index 0000000000..f71a1894c4 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/tp4_pp1/model_config.yaml @@ -0,0 +1,10 @@ +.model_config: !include ../bert_mcore.yaml + +ENV_VARS: !reference [.model_config, ENV_VARS] + +MODEL_ARGS: + <<: !reference [.model_config, MODEL_ARGS] + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + +TEST_TYPE: !reference [.model_config, TEST_TYPE]