From 1219a263351c8e0c738e9ed1df3c7cfcfae44212 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 29 Oct 2025 02:46:14 -0700 Subject: [PATCH 01/10] renaming golden values Signed-off-by: Hongbin Liu --- ...ev_coreweave.json => golden_values_dev_dgxh100_coreweave.json} | 0 ...den_values_dev_eos.json => golden_values_dev_dgxh100_eos.json} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/{golden_values_dev_coreweave.json => golden_values_dev_dgxh100_coreweave.json} (100%) rename tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/{golden_values_dev_eos.json => golden_values_dev_dgxh100_eos.json} (100%) diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_coreweave.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_coreweave.json diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json similarity index 100% rename from tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_eos.json rename to tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgxh100_eos.json From ce6e6613b99465aa936ce8b80913c4eeeed337c0 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Tue, 4 Nov 2025 00:26:16 -0800 Subject: [PATCH 02/10] fix bug: accuracy issu because of recomputing and offloading same module Signed-off-by: Hongbin Liu --- megatron/core/tensor_parallel/random.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 5a44c38713..69e973142d 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -579,6 +579,14 @@ def _recompute(self, _): # Store the inputs for backward pass inputs = self.ctx.saved_tensors + def detach(t): + if isinstance(t, torch.Tensor): + requires_grad = t.requires_grad + t = t.detach() + t.requires_grad_(requires_grad) + return t + + inputs = tuple(detach(t) for t in inputs) with torch.enable_grad(), fp8_ctx, recompute_ctx: outputs = self.run_function(*inputs) From 2fe4aebf90dd0f82cc612281690fd59a55bf71b4 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Tue, 4 Nov 2025 00:33:06 -0800 Subject: [PATCH 03/10] format Signed-off-by: Hongbin Liu --- megatron/core/tensor_parallel/random.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 69e973142d..396e5c54a2 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -579,6 +579,7 @@ def _recompute(self, _): # Store the inputs for backward pass inputs = self.ctx.saved_tensors + def detach(t): if isinstance(t, torch.Tensor): requires_grad = t.requires_grad From fb3f7c38b93ffb51d5a1da2e1c54c5e812ad408f Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Tue, 4 Nov 2025 19:49:47 -0800 Subject: [PATCH 04/10] update golden values Signed-off-by: Hongbin Liu --- .../golden_values_dev_dgx_h100.json | 592 +++++++++--------- .../golden_values_dev_dgx_h100.json | 392 ++++++------ 2 files changed, 492 insertions(+), 492 deletions(-) diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json index 0a6724a3e9..4b32d4256d 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.07559, "2": 11.03834, - "3": 9.66022, - "4": 9.91367, - "5": 9.3291, - "6": 9.13927, - "7": 9.13591, - "8": 8.65527, - "9": 8.51396, - "10": 8.84095, - "11": 8.29144, - "12": 8.34584, - "13": 8.25509, - "14": 7.73685, - "15": 7.86273, - "16": 7.93699, - "17": 7.89257, - "18": 7.63116, - "19": 7.99719, - "20": 7.7453, - "21": 7.44298, - "22": 7.42242, - "23": 7.29721, - "24": 7.27467, - "25": 7.54562, - "26": 6.96839, - "27": 7.50569, - "28": 7.22761, - "29": 7.36579, - "30": 7.52635, - "31": 7.27036, - "32": 7.45548, - "33": 7.50952, - "34": 7.55694, - "35": 7.10212, - "36": 6.96414, - "37": 7.28438, - "38": 7.08049, - "39": 7.40908, - "40": 7.4335, - "41": 7.38491, - "42": 7.15766, - "43": 7.15867, - "44": 7.28831, - "45": 7.16729, - "46": 6.78429, - "47": 7.40937, - "48": 7.00259, - "49": 7.46241, - "50": 6.92143 + "3": 9.72869, + "4": 9.61678, + "5": 10.63323, + "6": 9.1681, + "7": 9.35196, + "8": 9.05204, + "9": 8.84148, + "10": 9.00321, + "11": 8.49799, + "12": 8.5218, + "13": 8.41649, + "14": 7.9096, + "15": 8.00627, + "16": 8.05394, + "17": 8.0203, + "18": 7.73136, + "19": 8.11676, + "20": 7.83945, + "21": 7.52196, + "22": 7.5295, + "23": 7.38729, + "24": 7.3758, + "25": 7.65255, + "26": 7.04795, + "27": 7.591, + "28": 7.30023, + "29": 7.45656, + "30": 7.60935, + "31": 7.3713, + "32": 7.55298, + "33": 7.59738, + "34": 7.65764, + "35": 7.17916, + "36": 7.04913, + "37": 7.38022, + "38": 7.14883, + "39": 7.50321, + "40": 7.51595, + "41": 7.45139, + "42": 7.21197, + "43": 7.21131, + "44": 7.38058, + "45": 7.16397, + "46": 6.86108, + "47": 7.27247, + "48": 7.10862, + "49": 7.56398, + "50": 7.00523 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 911219392.0, - "2": 910960384.0, - "3": 911156288.0, - "4": 913253376.0, - "5": 921845056.0, - "6": 941436672.0, - "7": 993745472.0, - "8": 974360512.0, - "9": 999146112.0, - "10": 992706944.0, - "11": 991438144.0, - "12": 979442048.0, - "13": 1029190272.0, - "14": 1008214656.0, - "15": 988472000.0, - "16": 988861120.0, - "17": 979173312.0, - "18": 996164608.0, - "19": 979453440.0, - "20": 982914688.0, - "21": 975473344.0, - "22": 955037568.0, - "23": 969208128.0, - "24": 965840832.0, - "25": 953269440.0, - "26": 949025536.0, - "27": 948458304.0, - "28": 951741184.0, - "29": 943926272.0, - "30": 935020160.0, - "31": 933230336.0, - "32": 930086848.0, - "33": 922853952.0, - "34": 927140800.0, - "35": 925348224.0, - "36": 925295168.0, - "37": 922758272.0, - "38": 922930752.0, - "39": 922322880.0, - "40": 921856640.0, - "41": 920227776.0, - "42": 918353664.0, - "43": 919655616.0, - "44": 914948224.0, - "45": 916392512.0, - "46": 914344448.0, - "47": 911769536.0, - "48": 912013248.0, - "49": 910349376.0, - "50": 914351616.0 + "1": 38802120.0, + "2": 38543052.0, + "3": 38738396.0, + "4": 113220144.0, + "5": 344100160.0, + "6": 435062816.0, + "7": 579598912.0, + "8": 819195200.0, + "9": 604910464.0, + "10": 690749824.0, + "11": 744002496.0, + "12": 520212192.0, + "13": 547932992.0, + "14": 585659584.0, + "15": 614149184.0, + "16": 664915328.0, + "17": 592272320.0, + "18": 630225856.0, + "19": 579959808.0, + "20": 800470080.0, + "21": 573941056.0, + "22": 557652032.0, + "23": 797256640.0, + "24": 826380864.0, + "25": 814860160.0, + "26": 617708032.0, + "27": 715680384.0, + "28": 548045824.0, + "29": 736312064.0, + "30": 722163456.0, + "31": 711986176.0, + "32": 674238208.0, + "33": 715239232.0, + "34": 677588288.0, + "35": 473423392.0, + "36": 451352800.0, + "37": 446739392.0, + "38": 567466304.0, + "39": 472519552.0, + "40": 434322048.0, + "41": 554276096.0, + "42": 526187424.0, + "43": 510713152.0, + "44": 522783808.0, + "45": 335511072.0, + "46": 450878784.0, + "47": 450397344.0, + "48": 321720704.0, + "49": 437443680.0, + "50": 419425088.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 5498353152.0, - "2": 5499147776.0, - "3": 5499940352.0, - "4": 5500732928.0, - "5": 5501525504.0, - "6": 5502318080.0, - "7": 5503110656.0, - "8": 5503903232.0, - "9": 5497958912.0, - "10": 5498751488.0, - "11": 5499544064.0, - "12": 5500336640.0, - "13": 5501129216.0, - "14": 5501921792.0, - "15": 5502714368.0, - "16": 5503506944.0, - "17": 5504299520.0, - "18": 5505092096.0, - "19": 5505884672.0, - "20": 5506677248.0, - "21": 5507469824.0, - "22": 5508262400.0, - "23": 5509054976.0, - "24": 5509847552.0, - "25": 5510640128.0, - "26": 5511432704.0, - "27": 5512225280.0, - "28": 5513017856.0, - "29": 5513810432.0, - "30": 5514603008.0, - "31": 5515395584.0, - "32": 5516188160.0, - "33": 5516980736.0, - "34": 5517773312.0, - "35": 5518565888.0, - "36": 5519358464.0, - "37": 5520151040.0, - "38": 5520943616.0, - "39": 5521736192.0, - "40": 5522528768.0, - "41": 5523321344.0, - "42": 5524113920.0, - "43": 5524906496.0, - "44": 5525699072.0, - "45": 5526491648.0, - "46": 5527284224.0, - "47": 5528076800.0, - "48": 5528869376.0, - "49": 5529661952.0, - "50": 5530454528.0 + "1": 5498340864.0, + "2": 5499135488.0, + "3": 5499928064.0, + "4": 5500720640.0, + "5": 5501513216.0, + "6": 5502305792.0, + "7": 5497946624.0, + "8": 5498739200.0, + "9": 5499531776.0, + "10": 5500324352.0, + "11": 5501116928.0, + "12": 5498342912.0, + "13": 5499135488.0, + "14": 5499928064.0, + "15": 5500720640.0, + "16": 5501513216.0, + "17": 5502305792.0, + "18": 5503098368.0, + "19": 5503890944.0, + "20": 5504683520.0, + "21": 5505476096.0, + "22": 5506268672.0, + "23": 5507061248.0, + "24": 5507853824.0, + "25": 5508646400.0, + "26": 5509438976.0, + "27": 5510231552.0, + "28": 5511024128.0, + "29": 5511816704.0, + "30": 5512609280.0, + "31": 5513401856.0, + "32": 5514194432.0, + "33": 5514987008.0, + "34": 5515779584.0, + "35": 5516572160.0, + "36": 5517364736.0, + "37": 5518157312.0, + "38": 5518949888.0, + "39": 5519742464.0, + "40": 5520535040.0, + "41": 5521327616.0, + "42": 5522120192.0, + "43": 5522912768.0, + "44": 5523705344.0, + "45": 5524497920.0, + "46": 5525290496.0, + "47": 5526083072.0, + "48": 5526875648.0, + "49": 5527668224.0, + "50": 5528460800.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41740259328.0, - "2": 43687292928.0, - "3": 43687292928.0, - "4": 43984064512.0, - "5": 43984064512.0, - "6": 43984064512.0, - "7": 43984064512.0, - "8": 44026380288.0, - "9": 44041506816.0, - "10": 44041506816.0, - "11": 44041506816.0, - "12": 44041506816.0, - "13": 44041506816.0, - "14": 44041506816.0, - "15": 44041506816.0, - "16": 44041506816.0, - "17": 44041506816.0, - "18": 44041506816.0, - "19": 44041506816.0, - "20": 44041506816.0, - "21": 44041506816.0, - "22": 44041506816.0, - "23": 44041506816.0, - "24": 44041506816.0, - "25": 44041506816.0, - "26": 44041506816.0, - "27": 44041506816.0, - "28": 44041506816.0, - "29": 44044173312.0, - "30": 44164231168.0, - "31": 44221079552.0, - "32": 44271415296.0, - "33": 44290232320.0, - "34": 44290232320.0, - "35": 44290232320.0, - "36": 44290232320.0, - "37": 44290232320.0, - "38": 44290232320.0, - "39": 44290232320.0, - "40": 44290232320.0, - "41": 44290232320.0, - "42": 44290232320.0, - "43": 44290232320.0, - "44": 44290232320.0, - "45": 44290232320.0, - "46": 44290232320.0, - "47": 44290232320.0, - "48": 44290232320.0, - "49": 44290232320.0, - "50": 44290232320.0 + "1": 41723441152.0, + "2": 43687280640.0, + "3": 43916578816.0, + "4": 43916578816.0, + "5": 43916578816.0, + "6": 43916578816.0, + "7": 43916578816.0, + "8": 43916578816.0, + "9": 43916578816.0, + "10": 43916578816.0, + "11": 43916578816.0, + "12": 44028436480.0, + "13": 44028436480.0, + "14": 44028436480.0, + "15": 44028436480.0, + "16": 44028436480.0, + "17": 44028436480.0, + "18": 44028436480.0, + "19": 44028436480.0, + "20": 44028436480.0, + "21": 44028436480.0, + "22": 44028436480.0, + "23": 44028436480.0, + "24": 44028436480.0, + "25": 44028436480.0, + "26": 44028436480.0, + "27": 44028436480.0, + "28": 44028436480.0, + "29": 44028436480.0, + "30": 44028436480.0, + "31": 44028436480.0, + "32": 44028436480.0, + "33": 44028436480.0, + "34": 44028436480.0, + "35": 44028436480.0, + "36": 44028436480.0, + "37": 44028436480.0, + "38": 44028436480.0, + "39": 44028436480.0, + "40": 44028436480.0, + "41": 44028436480.0, + "42": 44028436480.0, + "43": 44028436480.0, + "44": 44028436480.0, + "45": 44028436480.0, + "46": 44028436480.0, + "47": 44028436480.0, + "48": 44028436480.0, + "49": 44028436480.0, + "50": 44028436480.0 } }, "mtp_1 loss": { @@ -234,54 +234,54 @@ "values": { "1": 11.08623, "2": 11.1047, - "3": 10.47999, - "4": 10.13471, - "5": 9.79045, - "6": 9.50607, - "7": 9.51139, - "8": 8.85331, - "9": 8.66688, - "10": 8.95867, - "11": 8.29318, - "12": 8.36986, - "13": 8.25545, - "14": 7.73323, - "15": 7.86639, - "16": 7.92438, - "17": 7.86276, - "18": 7.61004, - "19": 8.00261, - "20": 7.73004, - "21": 7.41636, - "22": 7.41466, - "23": 7.28656, - "24": 7.27882, - "25": 7.54458, - "26": 6.96533, - "27": 7.5053, - "28": 7.20603, - "29": 7.37687, - "30": 7.52783, - "31": 7.27097, - "32": 7.46043, - "33": 7.51419, - "34": 7.56879, - "35": 7.09276, - "36": 6.96019, - "37": 7.29843, - "38": 7.07417, - "39": 7.43338, - "40": 7.43134, - "41": 7.40946, - "42": 7.15527, - "43": 7.15684, - "44": 7.30429, - "45": 7.18917, - "46": 6.77286, - "47": 7.44985, - "48": 7.02383, - "49": 7.4572, - "50": 6.92645 + "3": 10.54469, + "4": 10.08474, + "5": 9.76549, + "6": 9.56242, + "7": 9.59473, + "8": 8.97686, + "9": 8.83293, + "10": 9.1193, + "11": 8.44318, + "12": 8.49593, + "13": 8.37985, + "14": 7.81516, + "15": 7.95146, + "16": 8.01718, + "17": 7.94503, + "18": 7.68603, + "19": 8.07501, + "20": 7.79558, + "21": 7.46867, + "22": 7.46603, + "23": 7.32734, + "24": 7.32819, + "25": 7.58465, + "26": 6.99257, + "27": 7.53486, + "28": 7.23432, + "29": 7.40501, + "30": 7.55005, + "31": 7.30085, + "32": 7.48028, + "33": 7.53593, + "34": 7.60112, + "35": 7.12344, + "36": 6.99007, + "37": 7.32578, + "38": 7.09623, + "39": 7.45759, + "40": 7.45018, + "41": 7.40101, + "42": 7.14459, + "43": 7.13995, + "44": 7.32066, + "45": 7.0966, + "46": 6.80106, + "47": 7.21219, + "48": 7.05021, + "49": 7.48165, + "50": 6.95118 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 85.92313, - "2": 1.99152, - "3": 3.91366, - "4": 1.68454, - "5": 2.53883, - "6": 2.55539, - "7": 1.60104, - "8": 1.70562, - "9": 1.72325, - "10": 1.4332, - "11": 1.07958, - "12": 1.399, - "13": 1.10259, - "14": 1.43922, - "15": 1.12046, - "16": 1.33695, - "17": 1.24765, - "18": 1.11257, - "19": 1.10335, - "20": 1.12919, - "21": 1.27711, - "22": 1.09482, - "23": 1.27635, - "24": 1.112, - "25": 1.17791, - "26": 1.10426, - "27": 1.09103, - "28": 1.08338, - "29": 1.07904, - "30": 1.08709, - "31": 1.2237, - "32": 1.18059, - "33": 1.07913, - "34": 1.17232, - "35": 1.09059, - "36": 1.09648, - "37": 1.12683, - "38": 1.10153, - "39": 1.09557, - "40": 1.07747, - "41": 1.12905, - "42": 1.09275, - "43": 1.08609, - "44": 1.08042, - "45": 1.08321, - "46": 1.0732, - "47": 1.08666, - "48": 1.08865, - "49": 1.08808, - "50": 1.08086 + "1": 87.66203, + "2": 2.04189, + "3": 3.34278, + "4": 3.72414, + "5": 3.23492, + "6": 1.94546, + "7": 2.14942, + "8": 1.78075, + "9": 1.06029, + "10": 2.13554, + "11": 1.42578, + "12": 1.80986, + "13": 1.06134, + "14": 1.087, + "15": 1.16687, + "16": 1.20412, + "17": 1.06984, + "18": 1.07557, + "19": 1.04081, + "20": 1.21763, + "21": 1.06196, + "22": 1.14038, + "23": 2.25761, + "24": 1.09161, + "25": 1.04319, + "26": 1.40025, + "27": 1.04974, + "28": 1.03984, + "29": 1.05293, + "30": 1.48942, + "31": 1.04785, + "32": 1.0529, + "33": 1.04366, + "34": 1.0633, + "35": 1.0713, + "36": 1.05711, + "37": 1.08085, + "38": 1.07006, + "39": 1.06498, + "40": 1.05913, + "41": 1.0697, + "42": 1.079, + "43": 1.14122, + "44": 1.06478, + "45": 1.04692, + "46": 1.08174, + "47": 1.07595, + "48": 1.10523, + "49": 1.0839, + "50": 1.07754 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json index eca2cabaca..f3ef464697 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.04276, "2": 11.02298, - "3": 9.43542, - "4": 10.04672, - "5": 9.38572, - "6": 9.14547, - "7": 9.21155, - "8": 8.63445, - "9": 8.48944, - "10": 8.82764, - "11": 8.29479, - "12": 8.32819, - "13": 8.23003, - "14": 7.71724, - "15": 7.86963, - "16": 7.9228, - "17": 7.86049, - "18": 7.62035, - "19": 7.9851, - "20": 7.72027, - "21": 7.39754, - "22": 7.39767, - "23": 7.28334, - "24": 7.25057, - "25": 7.53131, - "26": 6.95335, - "27": 7.49421, - "28": 7.20415, - "29": 7.373, - "30": 7.50279, - "31": 7.25342, - "32": 7.43069, - "33": 7.48385, - "34": 7.53476, - "35": 7.10325, - "36": 6.94471, - "37": 7.26141, - "38": 7.07026, - "39": 7.40536, - "40": 7.42025, - "41": 7.34194, - "42": 7.11724, - "43": 7.11421, - "44": 7.27077, - "45": 7.0701, - "46": 6.77811, - "47": 7.18895, - "48": 7.00013, - "49": 7.45875, - "50": 6.90988 + "3": 9.50921, + "4": 10.86244, + "5": 9.36127, + "6": 9.05636, + "7": 9.20064, + "8": 8.98909, + "9": 8.67001, + "10": 9.00892, + "11": 8.50716, + "12": 8.45579, + "13": 8.41197, + "14": 7.92802, + "15": 7.99663, + "16": 8.04156, + "17": 8.06453, + "18": 7.73746, + "19": 8.09946, + "20": 7.85555, + "21": 7.54063, + "22": 7.51142, + "23": 7.39766, + "24": 7.36551, + "25": 7.63399, + "26": 7.04934, + "27": 7.60084, + "28": 7.30223, + "29": 7.47164, + "30": 7.61428, + "31": 7.34981, + "32": 7.53935, + "33": 7.59164, + "34": 7.64951, + "35": 7.18657, + "36": 7.03804, + "37": 7.36778, + "38": 7.14613, + "39": 7.50644, + "40": 7.51103, + "41": 7.44582, + "42": 7.20666, + "43": 7.2123, + "44": 7.37723, + "45": 7.17293, + "46": 6.86188, + "47": 7.2648, + "48": 7.1069, + "49": 7.56115, + "50": 7.00113 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 844114112.0, - "2": 843855296.0, - "3": 844048640.0, - "4": 842998208.0, - "5": 855786112.0, - "6": 878524160.0, - "7": 924542976.0, - "8": 917741504.0, - "9": 932042112.0, - "10": 930847360.0, - "11": 954742400.0, - "12": 922824128.0, - "13": 968378816.0, - "14": 965228416.0, - "15": 951776640.0, - "16": 941679424.0, - "17": 929894336.0, - "18": 928011136.0, - "19": 955339264.0, - "20": 987111232.0, - "21": 924095488.0, - "22": 906805504.0, - "23": 895810432.0, - "24": 902927680.0, - "25": 927056960.0, - "26": 879821440.0, - "27": 911759744.0, - "28": 902460416.0, - "29": 872625216.0, - "30": 865815744.0, - "31": 868220352.0, - "32": 865076800.0, - "33": 864135552.0, - "34": 855839104.0, - "35": 854046784.0, - "36": 855042176.0, - "37": 850408192.0, - "38": 850580480.0, - "39": 849972608.0, - "40": 849505792.0, - "41": 845780352.0, - "42": 846003392.0, - "43": 848354688.0, - "44": 850986496.0, - "45": 848236160.0, - "46": 855625856.0, - "47": 843613312.0, - "48": 851197312.0, - "49": 851630464.0, - "50": 846195968.0 + "1": 38808176.0, + "2": 38549232.0, + "3": 38741780.0, + "4": 78604016.0, + "5": 152229680.0, + "6": 299762016.0, + "7": 557587712.0, + "8": 589584384.0, + "9": 482229120.0, + "10": 517739584.0, + "11": 526962624.0, + "12": 476182528.0, + "13": 667453056.0, + "14": 563635200.0, + "15": 592125760.0, + "16": 589362048.0, + "17": 453879424.0, + "18": 444631456.0, + "19": 532791520.0, + "20": 677797248.0, + "21": 545577920.0, + "22": 494731040.0, + "23": 551928576.0, + "24": 489800928.0, + "25": 644993344.0, + "26": 441532864.0, + "27": 467175040.0, + "28": 431687840.0, + "29": 409185824.0, + "30": 583756032.0, + "31": 592451072.0, + "32": 416290048.0, + "33": 391230880.0, + "34": 325273120.0, + "35": 350756576.0, + "36": 331801376.0, + "37": 349196160.0, + "38": 312664800.0, + "39": 419015584.0, + "40": 299035872.0, + "41": 274307296.0, + "42": 296551584.0, + "43": 381740640.0, + "44": 308872480.0, + "45": 263141648.0, + "46": 353360864.0, + "47": 271093472.0, + "48": 346833600.0, + "49": 267589936.0, + "50": 252702768.0 } }, "mem-allocated-bytes": { @@ -177,54 +177,54 @@ "values": { "1": 37959917568.0, "2": 39578673152.0, - "3": 39580192768.0, - "4": 39580192768.0, - "5": 39583301632.0, - "6": 39583301632.0, - "7": 39583301632.0, - "8": 39583301632.0, - "9": 39583301632.0, - "10": 39583301632.0, - "11": 39583301632.0, - "12": 39583301632.0, - "13": 39583301632.0, - "14": 39583301632.0, - "15": 39583301632.0, - "16": 39583301632.0, - "17": 39583301632.0, - "18": 39583301632.0, - "19": 39583301632.0, - "20": 39583301632.0, - "21": 39583301632.0, - "22": 39583301632.0, - "23": 39583301632.0, - "24": 39583301632.0, - "25": 39583301632.0, - "26": 39583301632.0, - "27": 39583301632.0, - "28": 39583301632.0, - "29": 39583301632.0, - "30": 39583301632.0, - "31": 39583301632.0, - "32": 39583301632.0, - "33": 39583301632.0, - "34": 39583301632.0, - "35": 39583301632.0, - "36": 39583301632.0, - "37": 39583301632.0, - "38": 39583301632.0, - "39": 39583301632.0, - "40": 39583301632.0, - "41": 39583301632.0, - "42": 39583301632.0, - "43": 39583301632.0, - "44": 39583301632.0, - "45": 39583301632.0, - "46": 39583301632.0, - "47": 39583301632.0, - "48": 39583301632.0, - "49": 39583301632.0, - "50": 39583301632.0 + "3": 39583825920.0, + "4": 39583825920.0, + "5": 39586181120.0, + "6": 39586181120.0, + "7": 39586181120.0, + "8": 39586181120.0, + "9": 39586181120.0, + "10": 39586181120.0, + "11": 39586181120.0, + "12": 39586181120.0, + "13": 39586181120.0, + "14": 39586181120.0, + "15": 39586181120.0, + "16": 39586181120.0, + "17": 39586181120.0, + "18": 39586181120.0, + "19": 39586181120.0, + "20": 39586181120.0, + "21": 39586181120.0, + "22": 39586181120.0, + "23": 39586181120.0, + "24": 39586181120.0, + "25": 39586181120.0, + "26": 39586181120.0, + "27": 39586181120.0, + "28": 39586181120.0, + "29": 39586181120.0, + "30": 39586181120.0, + "31": 39586181120.0, + "32": 39586181120.0, + "33": 39586181120.0, + "34": 39586181120.0, + "35": 39586181120.0, + "36": 39586181120.0, + "37": 39586181120.0, + "38": 39586181120.0, + "39": 39586181120.0, + "40": 39586181120.0, + "41": 39586181120.0, + "42": 39586181120.0, + "43": 39586181120.0, + "44": 39586181120.0, + "45": 39586181120.0, + "46": 39586181120.0, + "47": 39586181120.0, + "48": 39586181120.0, + "49": 39586181120.0, + "50": 39586181120.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 89.14162, - "2": 2.00665, - "3": 3.2832, - "4": 2.63833, - "5": 2.43073, - "6": 1.4868, - "7": 1.81732, - "8": 2.74562, - "9": 1.18286, - "10": 1.18542, - "11": 1.27273, - "12": 1.63885, - "13": 1.31323, - "14": 2.29007, - "15": 1.52021, - "16": 1.87975, - "17": 1.3507, - "18": 1.48627, - "19": 1.17842, - "20": 1.17004, - "21": 1.30369, - "22": 1.24781, - "23": 1.13565, - "24": 1.13418, - "25": 1.21915, - "26": 1.24288, - "27": 1.15052, - "28": 1.12573, - "29": 1.15398, - "30": 1.13143, - "31": 1.17104, - "32": 1.12919, - "33": 1.1286, - "34": 1.14327, - "35": 1.1721, - "36": 1.12494, - "37": 1.2626, - "38": 1.11425, - "39": 1.14594, - "40": 1.18189, - "41": 1.09297, - "42": 1.09247, - "43": 1.18621, - "44": 1.19564, - "45": 1.08252, - "46": 1.08511, - "47": 1.23319, - "48": 1.08249, - "49": 1.0979, - "50": 1.07182 + "1": 65.48328, + "2": 1.94615, + "3": 3.94539, + "4": 2.42699, + "5": 1.80319, + "6": 1.79395, + "7": 1.50546, + "8": 2.00251, + "9": 1.2172, + "10": 1.31071, + "11": 1.3171, + "12": 1.10351, + "13": 1.26314, + "14": 1.47608, + "15": 1.19001, + "16": 1.12949, + "17": 1.15105, + "18": 1.06698, + "19": 1.10069, + "20": 1.12463, + "21": 1.35075, + "22": 1.56258, + "23": 1.2368, + "24": 1.13707, + "25": 1.11826, + "26": 1.09445, + "27": 1.08857, + "28": 1.07964, + "29": 1.08505, + "30": 1.24068, + "31": 1.10419, + "32": 1.5164, + "33": 1.10245, + "34": 1.37977, + "35": 1.1165, + "36": 1.1457, + "37": 1.10487, + "38": 1.08491, + "39": 1.08901, + "40": 1.08968, + "41": 1.13702, + "42": 1.09805, + "43": 1.06669, + "44": 1.07791, + "45": 1.08898, + "46": 1.10717, + "47": 1.13008, + "48": 1.05745, + "49": 1.08268, + "50": 1.05678 } } } \ No newline at end of file From 993789010373c1e1844fb07b80c7ff726bc1c8ad Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 5 Nov 2025 01:04:35 -0800 Subject: [PATCH 05/10] update golden values Signed-off-by: Hongbin Liu --- .../golden_values_dev_dgx_h100.json | 392 +++++++++--------- 1 file changed, 196 insertions(+), 196 deletions(-) diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json index f3ef464697..150ba70462 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.04276, "2": 11.02298, - "3": 9.50921, - "4": 10.86244, - "5": 9.36127, - "6": 9.05636, - "7": 9.20064, - "8": 8.98909, - "9": 8.67001, - "10": 9.00892, - "11": 8.50716, - "12": 8.45579, - "13": 8.41197, - "14": 7.92802, - "15": 7.99663, - "16": 8.04156, - "17": 8.06453, - "18": 7.73746, - "19": 8.09946, - "20": 7.85555, - "21": 7.54063, - "22": 7.51142, - "23": 7.39766, - "24": 7.36551, - "25": 7.63399, - "26": 7.04934, - "27": 7.60084, - "28": 7.30223, - "29": 7.47164, - "30": 7.61428, - "31": 7.34981, - "32": 7.53935, - "33": 7.59164, - "34": 7.64951, - "35": 7.18657, - "36": 7.03804, - "37": 7.36778, - "38": 7.14613, - "39": 7.50644, - "40": 7.51103, - "41": 7.44582, - "42": 7.20666, - "43": 7.2123, - "44": 7.37723, - "45": 7.17293, - "46": 6.86188, - "47": 7.2648, - "48": 7.1069, - "49": 7.56115, - "50": 7.00113 + "3": 9.50907, + "4": 10.86145, + "5": 9.36104, + "6": 9.05664, + "7": 9.20646, + "8": 9.00188, + "9": 8.69791, + "10": 8.97535, + "11": 8.48206, + "12": 8.44961, + "13": 8.38916, + "14": 7.90422, + "15": 7.98559, + "16": 8.02787, + "17": 8.04894, + "18": 7.72163, + "19": 8.0935, + "20": 7.85609, + "21": 7.53372, + "22": 7.50495, + "23": 7.39733, + "24": 7.36369, + "25": 7.62993, + "26": 7.04703, + "27": 7.59839, + "28": 7.29807, + "29": 7.46826, + "30": 7.60613, + "31": 7.34795, + "32": 7.53766, + "33": 7.58939, + "34": 7.64431, + "35": 7.18358, + "36": 7.036, + "37": 7.36506, + "38": 7.14525, + "39": 7.50347, + "40": 7.50925, + "41": 7.44415, + "42": 7.20526, + "43": 7.21039, + "44": 7.37585, + "45": 7.1698, + "46": 6.8612, + "47": 7.26258, + "48": 7.1033, + "49": 7.55974, + "50": 6.99878 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38808176.0, - "2": 38549232.0, - "3": 38741780.0, - "4": 78604016.0, - "5": 152229680.0, - "6": 299762016.0, - "7": 557587712.0, - "8": 589584384.0, - "9": 482229120.0, - "10": 517739584.0, - "11": 526962624.0, - "12": 476182528.0, - "13": 667453056.0, - "14": 563635200.0, - "15": 592125760.0, - "16": 589362048.0, - "17": 453879424.0, - "18": 444631456.0, - "19": 532791520.0, - "20": 677797248.0, - "21": 545577920.0, - "22": 494731040.0, - "23": 551928576.0, - "24": 489800928.0, - "25": 644993344.0, - "26": 441532864.0, - "27": 467175040.0, - "28": 431687840.0, - "29": 409185824.0, - "30": 583756032.0, - "31": 592451072.0, - "32": 416290048.0, - "33": 391230880.0, - "34": 325273120.0, - "35": 350756576.0, - "36": 331801376.0, - "37": 349196160.0, - "38": 312664800.0, - "39": 419015584.0, - "40": 299035872.0, - "41": 274307296.0, - "42": 296551584.0, - "43": 381740640.0, - "44": 308872480.0, - "45": 263141648.0, - "46": 353360864.0, - "47": 271093472.0, - "48": 346833600.0, - "49": 267589936.0, - "50": 252702768.0 + "1": 38808152.0, + "2": 38549168.0, + "3": 38741680.0, + "4": 81738424.0, + "5": 161659808.0, + "6": 296608000.0, + "7": 557581568.0, + "8": 592711744.0, + "9": 479088640.0, + "10": 520896096.0, + "11": 555256320.0, + "12": 444724480.0, + "13": 658029440.0, + "14": 585665280.0, + "15": 588986240.0, + "16": 479280192.0, + "17": 494748608.0, + "18": 504398944.0, + "19": 601982144.0, + "20": 787884160.0, + "21": 536156160.0, + "22": 513609344.0, + "23": 577056256.0, + "24": 549563712.0, + "25": 648153280.0, + "26": 498150784.0, + "27": 501770816.0, + "28": 522921920.0, + "29": 462644416.0, + "30": 612066112.0, + "31": 605029312.0, + "32": 454036160.0, + "33": 419547936.0, + "34": 378748896.0, + "35": 385339904.0, + "36": 350676768.0, + "37": 478164480.0, + "38": 337833600.0, + "39": 450472544.0, + "40": 267556496.0, + "41": 280614912.0, + "42": 305998368.0, + "43": 372298848.0, + "44": 261697280.0, + "45": 225394720.0, + "46": 268431392.0, + "47": 217617888.0, + "48": 261904016.0, + "49": 229846288.0, + "50": 214954112.0 } }, "mem-allocated-bytes": { @@ -177,54 +177,54 @@ "values": { "1": 37959917568.0, "2": 39578673152.0, - "3": 39583825920.0, - "4": 39583825920.0, - "5": 39586181120.0, - "6": 39586181120.0, - "7": 39586181120.0, - "8": 39586181120.0, - "9": 39586181120.0, - "10": 39586181120.0, - "11": 39586181120.0, - "12": 39586181120.0, - "13": 39586181120.0, - "14": 39586181120.0, - "15": 39586181120.0, - "16": 39586181120.0, - "17": 39586181120.0, - "18": 39586181120.0, - "19": 39586181120.0, - "20": 39586181120.0, - "21": 39586181120.0, - "22": 39586181120.0, - "23": 39586181120.0, - "24": 39586181120.0, - "25": 39586181120.0, - "26": 39586181120.0, - "27": 39586181120.0, - "28": 39586181120.0, - "29": 39586181120.0, - "30": 39586181120.0, - "31": 39586181120.0, - "32": 39586181120.0, - "33": 39586181120.0, - "34": 39586181120.0, - "35": 39586181120.0, - "36": 39586181120.0, - "37": 39586181120.0, - "38": 39586181120.0, - "39": 39586181120.0, - "40": 39586181120.0, - "41": 39586181120.0, - "42": 39586181120.0, - "43": 39586181120.0, - "44": 39586181120.0, - "45": 39586181120.0, - "46": 39586181120.0, - "47": 39586181120.0, - "48": 39586181120.0, - "49": 39586181120.0, - "50": 39586181120.0 + "3": 39583842304.0, + "4": 39583842304.0, + "5": 39584591872.0, + "6": 39584591872.0, + "7": 39584591872.0, + "8": 39584591872.0, + "9": 39584591872.0, + "10": 39584591872.0, + "11": 39584591872.0, + "12": 39584591872.0, + "13": 39584591872.0, + "14": 39584591872.0, + "15": 39584591872.0, + "16": 39584591872.0, + "17": 39584591872.0, + "18": 39584591872.0, + "19": 39584591872.0, + "20": 39584591872.0, + "21": 39584591872.0, + "22": 39584591872.0, + "23": 39584591872.0, + "24": 39584591872.0, + "25": 39584591872.0, + "26": 39584591872.0, + "27": 39584591872.0, + "28": 39584591872.0, + "29": 39584591872.0, + "30": 39584591872.0, + "31": 39584591872.0, + "32": 39584591872.0, + "33": 39584591872.0, + "34": 39584591872.0, + "35": 39584591872.0, + "36": 39584591872.0, + "37": 39584591872.0, + "38": 39584591872.0, + "39": 39584591872.0, + "40": 39584591872.0, + "41": 39584591872.0, + "42": 39584591872.0, + "43": 39584591872.0, + "44": 39584591872.0, + "45": 39584591872.0, + "46": 39584591872.0, + "47": 39584591872.0, + "48": 39584591872.0, + "49": 39584591872.0, + "50": 39584591872.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 65.48328, - "2": 1.94615, - "3": 3.94539, - "4": 2.42699, - "5": 1.80319, - "6": 1.79395, - "7": 1.50546, - "8": 2.00251, - "9": 1.2172, - "10": 1.31071, - "11": 1.3171, - "12": 1.10351, - "13": 1.26314, - "14": 1.47608, - "15": 1.19001, - "16": 1.12949, - "17": 1.15105, - "18": 1.06698, - "19": 1.10069, - "20": 1.12463, - "21": 1.35075, - "22": 1.56258, - "23": 1.2368, - "24": 1.13707, - "25": 1.11826, - "26": 1.09445, - "27": 1.08857, - "28": 1.07964, - "29": 1.08505, - "30": 1.24068, - "31": 1.10419, - "32": 1.5164, - "33": 1.10245, - "34": 1.37977, - "35": 1.1165, - "36": 1.1457, - "37": 1.10487, - "38": 1.08491, - "39": 1.08901, - "40": 1.08968, - "41": 1.13702, - "42": 1.09805, - "43": 1.06669, - "44": 1.07791, - "45": 1.08898, - "46": 1.10717, - "47": 1.13008, - "48": 1.05745, - "49": 1.08268, - "50": 1.05678 + "1": 65.95827, + "2": 1.9924, + "3": 3.92592, + "4": 2.4652, + "5": 1.84842, + "6": 1.80402, + "7": 1.67822, + "8": 1.88485, + "9": 1.32993, + "10": 1.37648, + "11": 1.18596, + "12": 1.16521, + "13": 1.14524, + "14": 1.34968, + "15": 1.22798, + "16": 1.10709, + "17": 1.2737, + "18": 1.12048, + "19": 1.44431, + "20": 1.22659, + "21": 1.23111, + "22": 1.27597, + "23": 1.25479, + "24": 1.12437, + "25": 1.28457, + "26": 1.26411, + "27": 1.16703, + "28": 1.13595, + "29": 1.24774, + "30": 1.10985, + "31": 1.3919, + "32": 1.10386, + "33": 1.20402, + "34": 1.08667, + "35": 1.10247, + "36": 1.09087, + "37": 1.16339, + "38": 1.12236, + "39": 1.10519, + "40": 1.20224, + "41": 1.11719, + "42": 1.18432, + "43": 1.11065, + "44": 1.14205, + "45": 1.12352, + "46": 1.09449, + "47": 1.10298, + "48": 1.10504, + "49": 1.09853, + "50": 1.0939 } } } \ No newline at end of file From 6c83118d55f537feabb1d934ef7437bcd8ed673f Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 6 Nov 2025 02:31:18 -0800 Subject: [PATCH 06/10] update model_config and golden values Signed-off-by: Hongbin Liu --- .../transformer/multi_token_prediction.py | 3 + megatron/training/arguments.py | 2 +- .../golden_values_dev_dgx_h100.json | 600 +++++++++--------- .../model_config.yaml | 6 +- .../golden_values_dev_dgx_h100.json | 500 +++++++-------- .../model_config.yaml | 6 +- 6 files changed, 560 insertions(+), 557 deletions(-) diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index 945682741d..80f72a91ff 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -29,6 +29,9 @@ make_tp_sharded_tensor_for_checkpoint, make_viewless_tensor, ) +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer +) if is_torch_min_version("1.13.0"): dist_all_gather_func = torch.distributed.all_gather_into_tensor diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py index 3413d1e154..c91bb536fe 100644 --- a/megatron/training/arguments.py +++ b/megatron/training/arguments.py @@ -2346,7 +2346,7 @@ def _add_training_args(parser): group.add_argument('--fine-grained-activation-offloading', action='store_true', help='Enable fine-grained activation offloading.') group.add_argument('--offload-modules', nargs='*', type=str, default=[], - help='The submodules to offload its input. Choices: "attn_norm", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act".') + help='The submodules to offload its input. Choices: "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act".') group.add_argument('--min-offloaded-tensor-size', type=int, default=1024*1024, help='The minimum size of the tensor to be offloaded.') return parser diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json index 4b32d4256d..e7f62bbe4a 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.07559, - "2": 11.03834, - "3": 9.72869, - "4": 9.61678, - "5": 10.63323, - "6": 9.1681, - "7": 9.35196, - "8": 9.05204, - "9": 8.84148, - "10": 9.00321, - "11": 8.49799, - "12": 8.5218, - "13": 8.41649, - "14": 7.9096, - "15": 8.00627, - "16": 8.05394, - "17": 8.0203, - "18": 7.73136, - "19": 8.11676, - "20": 7.83945, - "21": 7.52196, - "22": 7.5295, - "23": 7.38729, - "24": 7.3758, - "25": 7.65255, - "26": 7.04795, - "27": 7.591, - "28": 7.30023, - "29": 7.45656, - "30": 7.60935, - "31": 7.3713, - "32": 7.55298, - "33": 7.59738, - "34": 7.65764, - "35": 7.17916, - "36": 7.04913, - "37": 7.38022, - "38": 7.14883, - "39": 7.50321, - "40": 7.51595, - "41": 7.45139, - "42": 7.21197, - "43": 7.21131, - "44": 7.38058, - "45": 7.16397, - "46": 6.86108, - "47": 7.27247, - "48": 7.10862, - "49": 7.56398, - "50": 7.00523 + "1": 11.06715, + "2": 11.06051, + "3": 10.21154, + "4": 9.95175, + "5": 10.12622, + "6": 8.82146, + "7": 9.52879, + "8": 8.442, + "9": 7.84738, + "10": 7.07075, + "11": 9.31042, + "12": 9.16013, + "13": 7.87292, + "14": 8.2102, + "15": 8.22483, + "16": 8.17879, + "17": 8.21121, + "18": 7.50325, + "19": 8.08274, + "20": 7.62562, + "21": 7.95058, + "22": 7.29789, + "23": 7.93775, + "24": 7.44169, + "25": 8.23817, + "26": 7.74959, + "27": 7.69344, + "28": 7.65487, + "29": 7.75173, + "30": 7.56007, + "31": 7.81567, + "32": 6.46589, + "33": 7.20401, + "34": 7.77921, + "35": 7.72944, + "36": 6.71776, + "37": 8.08311, + "38": 7.6137, + "39": 7.96476, + "40": 7.50072, + "41": 7.50304, + "42": 6.11349, + "43": 7.59404, + "44": 7.91361, + "45": 6.83615, + "46": 7.41293, + "47": 7.79226, + "48": 7.87549, + "49": 7.58763, + "50": 6.84525 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38802120.0, - "2": 38543052.0, - "3": 38738396.0, - "4": 113220144.0, - "5": 344100160.0, - "6": 435062816.0, - "7": 579598912.0, - "8": 819195200.0, - "9": 604910464.0, - "10": 690749824.0, - "11": 744002496.0, - "12": 520212192.0, - "13": 547932992.0, - "14": 585659584.0, - "15": 614149184.0, - "16": 664915328.0, - "17": 592272320.0, - "18": 630225856.0, - "19": 579959808.0, - "20": 800470080.0, - "21": 573941056.0, - "22": 557652032.0, - "23": 797256640.0, - "24": 826380864.0, - "25": 814860160.0, - "26": 617708032.0, - "27": 715680384.0, - "28": 548045824.0, - "29": 736312064.0, - "30": 722163456.0, - "31": 711986176.0, - "32": 674238208.0, - "33": 715239232.0, - "34": 677588288.0, - "35": 473423392.0, - "36": 451352800.0, - "37": 446739392.0, - "38": 567466304.0, - "39": 472519552.0, - "40": 434322048.0, - "41": 554276096.0, - "42": 526187424.0, - "43": 510713152.0, - "44": 522783808.0, - "45": 335511072.0, - "46": 450878784.0, - "47": 450397344.0, - "48": 321720704.0, - "49": 437443680.0, - "50": 419425088.0 + "1": 47165192.0, + "2": 46897912.0, + "3": 52684456.0, + "4": 297127552.0, + "5": 562950784.0, + "6": 668142144.0, + "7": 1027449536.0, + "8": 752259328.0, + "9": 830947776.0, + "10": 718307136.0, + "11": 823731840.0, + "12": 804867840.0, + "13": 639461056.0, + "14": 625408576.0, + "15": 716256960.0, + "16": 870866752.0, + "17": 673817856.0, + "18": 811900096.0, + "19": 892689024.0, + "20": 878114112.0, + "21": 666859968.0, + "22": 792718848.0, + "23": 783683200.0, + "24": 770686976.0, + "25": 651376640.0, + "26": 780070272.0, + "27": 801722496.0, + "28": 670273664.0, + "29": 647960768.0, + "30": 789867776.0, + "31": 801385856.0, + "32": 787688640.0, + "33": 783506816.0, + "34": 792837760.0, + "35": 776103936.0, + "36": 761920512.0, + "37": 775085824.0, + "38": 752868608.0, + "39": 754997184.0, + "40": 745075072.0, + "41": 713941440.0, + "42": 689968512.0, + "43": 663461824.0, + "44": 680285632.0, + "45": 644628992.0, + "46": 641672704.0, + "47": 642439616.0, + "48": 597700608.0, + "49": 603523520.0, + "50": 601014528.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 5498340864.0, - "2": 5499135488.0, - "3": 5499928064.0, - "4": 5500720640.0, - "5": 5501513216.0, - "6": 5502305792.0, - "7": 5497946624.0, - "8": 5498739200.0, - "9": 5499531776.0, - "10": 5500324352.0, - "11": 5501116928.0, - "12": 5498342912.0, - "13": 5499135488.0, - "14": 5499928064.0, - "15": 5500720640.0, - "16": 5501513216.0, - "17": 5502305792.0, - "18": 5503098368.0, - "19": 5503890944.0, - "20": 5504683520.0, - "21": 5505476096.0, - "22": 5506268672.0, - "23": 5507061248.0, - "24": 5507853824.0, - "25": 5508646400.0, - "26": 5509438976.0, - "27": 5510231552.0, - "28": 5511024128.0, - "29": 5511816704.0, - "30": 5512609280.0, - "31": 5513401856.0, - "32": 5514194432.0, - "33": 5514987008.0, - "34": 5515779584.0, - "35": 5516572160.0, - "36": 5517364736.0, - "37": 5518157312.0, - "38": 5518949888.0, - "39": 5519742464.0, - "40": 5520535040.0, - "41": 5521327616.0, - "42": 5522120192.0, - "43": 5522912768.0, - "44": 5523705344.0, - "45": 5524497920.0, - "46": 5525290496.0, - "47": 5526083072.0, - "48": 5526875648.0, - "49": 5527668224.0, - "50": 5528460800.0 + "1": 5290944000.0, + "2": 5291148800.0, + "3": 5291351552.0, + "4": 5290946048.0, + "5": 5291148800.0, + "6": 5291351552.0, + "7": 5291554304.0, + "8": 5291757056.0, + "9": 5291959808.0, + "10": 5292162560.0, + "11": 5292365312.0, + "12": 5292568064.0, + "13": 5292770816.0, + "14": 5292973568.0, + "15": 5293176320.0, + "16": 5293379072.0, + "17": 5293581824.0, + "18": 5293784576.0, + "19": 5293987328.0, + "20": 5294190080.0, + "21": 5294392832.0, + "22": 5294595584.0, + "23": 5294798336.0, + "24": 5295001088.0, + "25": 5295203840.0, + "26": 5295406592.0, + "27": 5295609344.0, + "28": 5295812096.0, + "29": 5296014848.0, + "30": 5296217600.0, + "31": 5296420352.0, + "32": 5296623104.0, + "33": 5296825856.0, + "34": 5297028608.0, + "35": 5297231360.0, + "36": 5297434112.0, + "37": 5297636864.0, + "38": 5297839616.0, + "39": 5298042368.0, + "40": 5298245120.0, + "41": 5298447872.0, + "42": 5298650624.0, + "43": 5298853376.0, + "44": 5299056128.0, + "45": 5299258880.0, + "46": 5299461632.0, + "47": 5299664384.0, + "48": 5299867136.0, + "49": 5300069888.0, + "50": 5300272640.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 41723441152.0, - "2": 43687280640.0, - "3": 43916578816.0, - "4": 43916578816.0, - "5": 43916578816.0, - "6": 43916578816.0, - "7": 43916578816.0, - "8": 43916578816.0, - "9": 43916578816.0, - "10": 43916578816.0, - "11": 43916578816.0, - "12": 44028436480.0, - "13": 44028436480.0, - "14": 44028436480.0, - "15": 44028436480.0, - "16": 44028436480.0, - "17": 44028436480.0, - "18": 44028436480.0, - "19": 44028436480.0, - "20": 44028436480.0, - "21": 44028436480.0, - "22": 44028436480.0, - "23": 44028436480.0, - "24": 44028436480.0, - "25": 44028436480.0, - "26": 44028436480.0, - "27": 44028436480.0, - "28": 44028436480.0, - "29": 44028436480.0, - "30": 44028436480.0, - "31": 44028436480.0, - "32": 44028436480.0, - "33": 44028436480.0, - "34": 44028436480.0, - "35": 44028436480.0, - "36": 44028436480.0, - "37": 44028436480.0, - "38": 44028436480.0, - "39": 44028436480.0, - "40": 44028436480.0, - "41": 44028436480.0, - "42": 44028436480.0, - "43": 44028436480.0, - "44": 44028436480.0, - "45": 44028436480.0, - "46": 44028436480.0, - "47": 44028436480.0, - "48": 44028436480.0, - "49": 44028436480.0, - "50": 44028436480.0 + "1": 6180783616.0, + "2": 8225679872.0, + "3": 8225679872.0, + "4": 8225679872.0, + "5": 8225679872.0, + "6": 8225679872.0, + "7": 8225679872.0, + "8": 8225679872.0, + "9": 8225679872.0, + "10": 8225679872.0, + "11": 8239991296.0, + "12": 8239991296.0, + "13": 8239991296.0, + "14": 8239991296.0, + "15": 8239991296.0, + "16": 8239991296.0, + "17": 8244914688.0, + "18": 8244914688.0, + "19": 8244914688.0, + "20": 8265598464.0, + "21": 8265598464.0, + "22": 8265598464.0, + "23": 8265598464.0, + "24": 8265598464.0, + "25": 8265598464.0, + "26": 8265598464.0, + "27": 8265598464.0, + "28": 8265598464.0, + "29": 8271664640.0, + "30": 8316803584.0, + "31": 8316803584.0, + "32": 8316803584.0, + "33": 8316803584.0, + "34": 8316803584.0, + "35": 8316803584.0, + "36": 8316803584.0, + "37": 8316803584.0, + "38": 8316803584.0, + "39": 8318923264.0, + "40": 8318923264.0, + "41": 8318923264.0, + "42": 8318923264.0, + "43": 8318923264.0, + "44": 8318923264.0, + "45": 8318923264.0, + "46": 8318923264.0, + "47": 8318923264.0, + "48": 8318923264.0, + "49": 8318923264.0, + "50": 8318923264.0 } }, "mtp_1 loss": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.08623, - "2": 11.1047, - "3": 10.54469, - "4": 10.08474, - "5": 9.76549, - "6": 9.56242, - "7": 9.59473, - "8": 8.97686, - "9": 8.83293, - "10": 9.1193, - "11": 8.44318, - "12": 8.49593, - "13": 8.37985, - "14": 7.81516, - "15": 7.95146, - "16": 8.01718, - "17": 7.94503, - "18": 7.68603, - "19": 8.07501, - "20": 7.79558, - "21": 7.46867, - "22": 7.46603, - "23": 7.32734, - "24": 7.32819, - "25": 7.58465, - "26": 6.99257, - "27": 7.53486, - "28": 7.23432, - "29": 7.40501, - "30": 7.55005, - "31": 7.30085, - "32": 7.48028, - "33": 7.53593, - "34": 7.60112, - "35": 7.12344, - "36": 6.99007, - "37": 7.32578, - "38": 7.09623, - "39": 7.45759, - "40": 7.45018, - "41": 7.40101, - "42": 7.14459, - "43": 7.13995, - "44": 7.32066, - "45": 7.0966, - "46": 6.80106, - "47": 7.21219, - "48": 7.05021, - "49": 7.48165, - "50": 6.95118 + "1": 11.07395, + "2": 11.0927, + "3": 10.82648, + "4": 10.27524, + "5": 10.45343, + "6": 8.32789, + "7": 9.82687, + "8": 8.01561, + "9": 7.47686, + "10": 6.75778, + "11": 8.92977, + "12": 8.98867, + "13": 7.80263, + "14": 8.02637, + "15": 8.11184, + "16": 8.13967, + "17": 8.13444, + "18": 7.44744, + "19": 8.03657, + "20": 7.53993, + "21": 7.90129, + "22": 7.27518, + "23": 7.88304, + "24": 7.37567, + "25": 8.16836, + "26": 7.69935, + "27": 7.6262, + "28": 7.61271, + "29": 7.69819, + "30": 7.4848, + "31": 7.73967, + "32": 6.36884, + "33": 7.14295, + "34": 7.71844, + "35": 7.63485, + "36": 6.61195, + "37": 8.02821, + "38": 7.57841, + "39": 7.89473, + "40": 7.41461, + "41": 7.42116, + "42": 6.01344, + "43": 7.4906, + "44": 7.86418, + "45": 6.74814, + "46": 7.30484, + "47": 7.72617, + "48": 7.79074, + "49": 7.49049, + "50": 6.75504 } }, "iteration-time": { @@ -289,56 +289,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 87.66203, - "2": 2.04189, - "3": 3.34278, - "4": 3.72414, - "5": 3.23492, - "6": 1.94546, - "7": 2.14942, - "8": 1.78075, - "9": 1.06029, - "10": 2.13554, - "11": 1.42578, - "12": 1.80986, - "13": 1.06134, - "14": 1.087, - "15": 1.16687, - "16": 1.20412, - "17": 1.06984, - "18": 1.07557, - "19": 1.04081, - "20": 1.21763, - "21": 1.06196, - "22": 1.14038, - "23": 2.25761, - "24": 1.09161, - "25": 1.04319, - "26": 1.40025, - "27": 1.04974, - "28": 1.03984, - "29": 1.05293, - "30": 1.48942, - "31": 1.04785, - "32": 1.0529, - "33": 1.04366, - "34": 1.0633, - "35": 1.0713, - "36": 1.05711, - "37": 1.08085, - "38": 1.07006, - "39": 1.06498, - "40": 1.05913, - "41": 1.0697, - "42": 1.079, - "43": 1.14122, - "44": 1.06478, - "45": 1.04692, - "46": 1.08174, - "47": 1.07595, - "48": 1.10523, - "49": 1.0839, - "50": 1.07754 + "1": 90.97535, + "2": 4.15413, + "3": 4.25282, + "4": 5.50314, + "5": 4.36528, + "6": 4.16016, + "7": 4.60989, + "8": 3.68392, + "9": 3.70951, + "10": 3.66417, + "11": 3.64904, + "12": 3.66094, + "13": 3.68824, + "14": 3.64996, + "15": 3.64159, + "16": 3.68269, + "17": 3.66905, + "18": 4.10783, + "19": 3.63362, + "20": 3.65129, + "21": 3.6431, + "22": 3.64946, + "23": 3.6411, + "24": 3.59707, + "25": 3.55364, + "26": 3.61478, + "27": 3.59779, + "28": 3.58741, + "29": 3.62545, + "30": 3.63538, + "31": 3.58264, + "32": 3.65914, + "33": 3.62764, + "34": 3.61962, + "35": 3.57076, + "36": 3.59244, + "37": 3.68499, + "38": 3.6803, + "39": 3.5849, + "40": 3.59019, + "41": 3.62068, + "42": 3.69144, + "43": 3.71863, + "44": 3.67193, + "45": 3.65673, + "46": 3.66919, + "47": 3.58334, + "48": 3.57229, + "49": 3.66195, + "50": 3.64157 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml index 487382042b..c657b9087e 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/model_config.yaml @@ -23,8 +23,8 @@ MODEL_ARGS: --use-mcore-models: true --sequence-parallel: true --disable-bias-linear: true - --micro-batch-size: 4 - --global-batch-size: 32 + --micro-batch-size: 1 + --global-batch-size: 8 --train-iters: 50 --exit-duration-in-mins: 230 --no-check-for-nan-in-loss-and-grad: true @@ -36,7 +36,7 @@ MODEL_ARGS: --recompute-granularity: selective --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" --fine-grained-activation-offloading: true - --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" + --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm qkv_linear core_attn attn_proj]" # Transformer Engine args --transformer-impl: transformer_engine # Data args diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json index 150ba70462..1483224813 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -4,56 +4,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 11.04276, - "2": 11.02298, - "3": 9.50907, - "4": 10.86145, - "5": 9.36104, - "6": 9.05664, - "7": 9.20646, - "8": 9.00188, - "9": 8.69791, - "10": 8.97535, - "11": 8.48206, - "12": 8.44961, - "13": 8.38916, - "14": 7.90422, - "15": 7.98559, - "16": 8.02787, - "17": 8.04894, - "18": 7.72163, - "19": 8.0935, - "20": 7.85609, - "21": 7.53372, - "22": 7.50495, - "23": 7.39733, - "24": 7.36369, - "25": 7.62993, - "26": 7.04703, - "27": 7.59839, - "28": 7.29807, - "29": 7.46826, - "30": 7.60613, - "31": 7.34795, - "32": 7.53766, - "33": 7.58939, - "34": 7.64431, - "35": 7.18358, - "36": 7.036, - "37": 7.36506, - "38": 7.14525, - "39": 7.50347, - "40": 7.50925, - "41": 7.44415, - "42": 7.20526, - "43": 7.21039, - "44": 7.37585, - "45": 7.1698, - "46": 6.8612, - "47": 7.26258, - "48": 7.1033, - "49": 7.55974, - "50": 6.99878 + "1": 11.01686, + "2": 11.06264, + "3": 10.17771, + "4": 10.86294, + "5": 9.81711, + "6": 9.10377, + "7": 9.61048, + "8": 8.39441, + "9": 7.79453, + "10": 7.15206, + "11": 9.06579, + "12": 12.40166, + "13": 8.04847, + "14": 8.24594, + "15": 8.24907, + "16": 8.32751, + "17": 8.35488, + "18": 7.58028, + "19": 8.18771, + "20": 7.71954, + "21": 8.00698, + "22": 7.35089, + "23": 7.95479, + "24": 7.51289, + "25": 8.32529, + "26": 7.78885, + "27": 7.72725, + "28": 7.71319, + "29": 7.77361, + "30": 7.56799, + "31": 7.85271, + "32": 6.52658, + "33": 7.24362, + "34": 7.80331, + "35": 7.74511, + "36": 6.73702, + "37": 8.15605, + "38": 7.62885, + "39": 7.97707, + "40": 7.52037, + "41": 7.52443, + "42": 6.12689, + "43": 7.60467, + "44": 7.96883, + "45": 6.84543, + "46": 7.42548, + "47": 7.82723, + "48": 7.87988, + "49": 7.59963, + "50": 6.85112 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 38808152.0, - "2": 38549168.0, - "3": 38741680.0, - "4": 81738424.0, - "5": 161659808.0, - "6": 296608000.0, - "7": 557581568.0, - "8": 592711744.0, - "9": 479088640.0, - "10": 520896096.0, - "11": 555256320.0, - "12": 444724480.0, - "13": 658029440.0, - "14": 585665280.0, - "15": 588986240.0, - "16": 479280192.0, - "17": 494748608.0, - "18": 504398944.0, - "19": 601982144.0, - "20": 787884160.0, - "21": 536156160.0, - "22": 513609344.0, - "23": 577056256.0, - "24": 549563712.0, - "25": 648153280.0, - "26": 498150784.0, - "27": 501770816.0, - "28": 522921920.0, - "29": 462644416.0, - "30": 612066112.0, - "31": 605029312.0, - "32": 454036160.0, - "33": 419547936.0, - "34": 378748896.0, - "35": 385339904.0, - "36": 350676768.0, - "37": 478164480.0, - "38": 337833600.0, - "39": 450472544.0, - "40": 267556496.0, - "41": 280614912.0, - "42": 305998368.0, - "43": 372298848.0, - "44": 261697280.0, - "45": 225394720.0, - "46": 268431392.0, - "47": 217617888.0, - "48": 261904016.0, - "49": 229846288.0, - "50": 214954112.0 + "1": 47167840.0, + "2": 46900628.0, + "3": 81003512.0, + "4": 243621808.0, + "5": 468555040.0, + "6": 561181184.0, + "7": 958267392.0, + "8": 720794112.0, + "9": 771164224.0, + "10": 718302016.0, + "11": 669618304.0, + "12": 559500096.0, + "13": 642601344.0, + "14": 754397952.0, + "15": 766531584.0, + "16": 697850240.0, + "17": 654906240.0, + "18": 745861824.0, + "19": 738620928.0, + "20": 887555328.0, + "21": 729800064.0, + "22": 666937216.0, + "23": 777389312.0, + "24": 607175552.0, + "25": 855782784.0, + "26": 846129152.0, + "27": 666477056.0, + "28": 830677504.0, + "29": 811523712.0, + "30": 657771072.0, + "31": 609501440.0, + "32": 784538816.0, + "33": 755198720.0, + "34": 729929280.0, + "35": 719482368.0, + "36": 699006208.0, + "37": 727900096.0, + "38": 711973824.0, + "39": 701515264.0, + "40": 682162752.0, + "41": 534678112.0, + "42": 655361792.0, + "43": 663463424.0, + "44": 642541952.0, + "45": 455907168.0, + "46": 613359936.0, + "47": 592108160.0, + "48": 585115008.0, + "49": 559483008.0, + "50": 544390208.0 } }, "mem-allocated-bytes": { @@ -118,56 +118,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 4419107328.0, - "2": 4419108864.0, - "3": 4419108864.0, - "4": 4419108864.0, - "5": 4419108864.0, - "6": 4419108864.0, - "7": 4419108864.0, - "8": 4419108864.0, - "9": 4419108864.0, - "10": 4419108864.0, - "11": 4419108864.0, - "12": 4419108864.0, - "13": 4419108864.0, - "14": 4419108864.0, - "15": 4419108864.0, - "16": 4419108864.0, - "17": 4419108864.0, - "18": 4419108864.0, - "19": 4419108864.0, - "20": 4419108864.0, - "21": 4419108864.0, - "22": 4419108864.0, - "23": 4419108864.0, - "24": 4419108864.0, - "25": 4419108864.0, - "26": 4419108864.0, - "27": 4419108864.0, - "28": 4419108864.0, - "29": 4419108864.0, - "30": 4419108864.0, - "31": 4419108864.0, - "32": 4419108864.0, - "33": 4419108864.0, - "34": 4419108864.0, - "35": 4419108864.0, - "36": 4419108864.0, - "37": 4419108864.0, - "38": 4419108864.0, - "39": 4419108864.0, - "40": 4419108864.0, - "41": 4419108864.0, - "42": 4419108864.0, - "43": 4419108864.0, - "44": 4419108864.0, - "45": 4419108864.0, - "46": 4419108864.0, - "47": 4419108864.0, - "48": 4419108864.0, - "49": 4419108864.0, - "50": 4419108864.0 + "1": 4315544064.0, + "2": 4315545600.0, + "3": 4315545600.0, + "4": 4315545600.0, + "5": 4315545600.0, + "6": 4315545600.0, + "7": 4315545600.0, + "8": 4315545600.0, + "9": 4315545600.0, + "10": 4315545600.0, + "11": 4315545600.0, + "12": 4315545600.0, + "13": 4315545600.0, + "14": 4315545600.0, + "15": 4315545600.0, + "16": 4315545600.0, + "17": 4315545600.0, + "18": 4315545600.0, + "19": 4315545600.0, + "20": 4315545600.0, + "21": 4315545600.0, + "22": 4315545600.0, + "23": 4315545600.0, + "24": 4315545600.0, + "25": 4315545600.0, + "26": 4315545600.0, + "27": 4315545600.0, + "28": 4315545600.0, + "29": 4315545600.0, + "30": 4315545600.0, + "31": 4315545600.0, + "32": 4315545600.0, + "33": 4315545600.0, + "34": 4315545600.0, + "35": 4315545600.0, + "36": 4315545600.0, + "37": 4315545600.0, + "38": 4315545600.0, + "39": 4315545600.0, + "40": 4315545600.0, + "41": 4315545600.0, + "42": 4315545600.0, + "43": 4315545600.0, + "44": 4315545600.0, + "45": 4315545600.0, + "46": 4315545600.0, + "47": 4315545600.0, + "48": 4315545600.0, + "49": 4315545600.0, + "50": 4315545600.0 } }, "mem-max-allocated-bytes": { @@ -175,56 +175,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 37959917568.0, - "2": 39578673152.0, - "3": 39583842304.0, - "4": 39583842304.0, - "5": 39584591872.0, - "6": 39584591872.0, - "7": 39584591872.0, - "8": 39584591872.0, - "9": 39584591872.0, - "10": 39584591872.0, - "11": 39584591872.0, - "12": 39584591872.0, - "13": 39584591872.0, - "14": 39584591872.0, - "15": 39584591872.0, - "16": 39584591872.0, - "17": 39584591872.0, - "18": 39584591872.0, - "19": 39584591872.0, - "20": 39584591872.0, - "21": 39584591872.0, - "22": 39584591872.0, - "23": 39584591872.0, - "24": 39584591872.0, - "25": 39584591872.0, - "26": 39584591872.0, - "27": 39584591872.0, - "28": 39584591872.0, - "29": 39584591872.0, - "30": 39584591872.0, - "31": 39584591872.0, - "32": 39584591872.0, - "33": 39584591872.0, - "34": 39584591872.0, - "35": 39584591872.0, - "36": 39584591872.0, - "37": 39584591872.0, - "38": 39584591872.0, - "39": 39584591872.0, - "40": 39584591872.0, - "41": 39584591872.0, - "42": 39584591872.0, - "43": 39584591872.0, - "44": 39584591872.0, - "45": 39584591872.0, - "46": 39584591872.0, - "47": 39584591872.0, - "48": 39584591872.0, - "49": 39584591872.0, - "50": 39584591872.0 + "1": 4919527424.0, + "2": 5861408768.0, + "3": 5861408768.0, + "4": 5863651328.0, + "5": 5863651328.0, + "6": 5863651328.0, + "7": 5863651328.0, + "8": 5863651328.0, + "9": 5863651328.0, + "10": 5863651328.0, + "11": 5863651328.0, + "12": 5863651328.0, + "13": 5863651328.0, + "14": 5863651328.0, + "15": 5863986176.0, + "16": 5865795072.0, + "17": 5865795072.0, + "18": 5865795072.0, + "19": 5865795072.0, + "20": 5865795072.0, + "21": 5866987520.0, + "22": 5866987520.0, + "23": 5866987520.0, + "24": 5866987520.0, + "25": 5866987520.0, + "26": 5866987520.0, + "27": 5866987520.0, + "28": 5866987520.0, + "29": 5866987520.0, + "30": 5866987520.0, + "31": 5866987520.0, + "32": 5866987520.0, + "33": 5866987520.0, + "34": 5866987520.0, + "35": 5866987520.0, + "36": 5866987520.0, + "37": 5866987520.0, + "38": 5866987520.0, + "39": 5866987520.0, + "40": 5866987520.0, + "41": 5866987520.0, + "42": 5866987520.0, + "43": 5866987520.0, + "44": 5866987520.0, + "45": 5866987520.0, + "46": 5866987520.0, + "47": 5866987520.0, + "48": 5866987520.0, + "49": 5866987520.0, + "50": 5866987520.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 65.95827, - "2": 1.9924, - "3": 3.92592, - "4": 2.4652, - "5": 1.84842, - "6": 1.80402, - "7": 1.67822, - "8": 1.88485, - "9": 1.32993, - "10": 1.37648, - "11": 1.18596, - "12": 1.16521, - "13": 1.14524, - "14": 1.34968, - "15": 1.22798, - "16": 1.10709, - "17": 1.2737, - "18": 1.12048, - "19": 1.44431, - "20": 1.22659, - "21": 1.23111, - "22": 1.27597, - "23": 1.25479, - "24": 1.12437, - "25": 1.28457, - "26": 1.26411, - "27": 1.16703, - "28": 1.13595, - "29": 1.24774, - "30": 1.10985, - "31": 1.3919, - "32": 1.10386, - "33": 1.20402, - "34": 1.08667, - "35": 1.10247, - "36": 1.09087, - "37": 1.16339, - "38": 1.12236, - "39": 1.10519, - "40": 1.20224, - "41": 1.11719, - "42": 1.18432, - "43": 1.11065, - "44": 1.14205, - "45": 1.12352, - "46": 1.09449, - "47": 1.10298, - "48": 1.10504, - "49": 1.09853, - "50": 1.0939 + "1": 72.699, + "2": 4.27015, + "3": 3.87365, + "4": 3.67041, + "5": 3.65964, + "6": 3.48532, + "7": 3.47679, + "8": 3.47349, + "9": 3.43879, + "10": 3.47441, + "11": 3.45737, + "12": 3.48691, + "13": 3.54474, + "14": 3.44102, + "15": 3.42127, + "16": 3.45795, + "17": 3.49717, + "18": 3.51293, + "19": 3.5617, + "20": 3.49733, + "21": 3.50336, + "22": 3.62308, + "23": 3.50166, + "24": 3.49075, + "25": 3.50996, + "26": 3.44423, + "27": 3.47323, + "28": 3.53784, + "29": 3.51989, + "30": 3.49211, + "31": 3.49945, + "32": 3.4419, + "33": 3.50458, + "34": 3.47663, + "35": 3.45702, + "36": 3.50281, + "37": 3.44136, + "38": 3.45165, + "39": 3.50095, + "40": 3.50126, + "41": 3.50863, + "42": 3.46684, + "43": 3.55122, + "44": 3.48372, + "45": 3.46903, + "46": 3.47654, + "47": 3.51574, + "48": 3.4895, + "49": 3.49404, + "50": 3.45824 } } } \ No newline at end of file diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml index 28ad106f52..5b177ed116 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/model_config.yaml @@ -23,8 +23,8 @@ MODEL_ARGS: --use-mcore-models: true --sequence-parallel: true --disable-bias-linear: true - --micro-batch-size: 4 - --global-batch-size: 32 + --micro-batch-size: 1 + --global-batch-size: 8 --train-iters: 50 --exit-duration-in-mins: 230 --no-check-for-nan-in-loss-and-grad: true @@ -36,7 +36,7 @@ MODEL_ARGS: --recompute-granularity: selective --recompute-modules: "[layernorm mla_up_proj mlp moe_act]" --fine-grained-activation-offloading: true - --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm]" + --offload-modules: "[expert_fc1 moe_act attn_norm mlp_norm qkv_linear core_attn attn_proj]" # Transformer Engine args --transformer-impl: transformer_engine # Data args From 33a38f51c735048c1df92a0ea39e289aba6a85de Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 6 Nov 2025 02:33:31 -0800 Subject: [PATCH 07/10] format Signed-off-by: Hongbin Liu --- megatron/core/transformer/multi_token_prediction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py index 80f72a91ff..d8d20039e4 100755 --- a/megatron/core/transformer/multi_token_prediction.py +++ b/megatron/core/transformer/multi_token_prediction.py @@ -13,6 +13,9 @@ from megatron.core.fp8_utils import get_fp8_context from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( + fine_grained_offloading_set_last_layer, +) from megatron.core.process_groups_config import ProcessGroupCollection from megatron.core.tensor_parallel import ( gather_from_tensor_model_parallel_region, @@ -29,9 +32,6 @@ make_tp_sharded_tensor_for_checkpoint, make_viewless_tensor, ) -from megatron.core.pipeline_parallel.fine_grained_activation_offload import ( - fine_grained_offloading_set_last_layer -) if is_torch_min_version("1.13.0"): dist_all_gather_func = torch.distributed.all_gather_into_tensor From 6c76b07a07d86e961c896834e29fdd4b02b135b2 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 6 Nov 2025 06:15:51 -0800 Subject: [PATCH 08/10] update golden values Signed-off-by: Hongbin Liu --- .../golden_values_dev_dgx_h100.json | 390 +++++++++--------- 1 file changed, 195 insertions(+), 195 deletions(-) diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json index 1483224813..f31e858405 100644 --- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json +++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json @@ -6,54 +6,54 @@ "values": { "1": 11.01686, "2": 11.06264, - "3": 10.17771, - "4": 10.86294, - "5": 9.81711, - "6": 9.10377, - "7": 9.61048, - "8": 8.39441, - "9": 7.79453, - "10": 7.15206, - "11": 9.06579, - "12": 12.40166, - "13": 8.04847, - "14": 8.24594, - "15": 8.24907, - "16": 8.32751, - "17": 8.35488, - "18": 7.58028, - "19": 8.18771, - "20": 7.71954, - "21": 8.00698, - "22": 7.35089, - "23": 7.95479, - "24": 7.51289, - "25": 8.32529, - "26": 7.78885, - "27": 7.72725, - "28": 7.71319, - "29": 7.77361, - "30": 7.56799, - "31": 7.85271, - "32": 6.52658, - "33": 7.24362, - "34": 7.80331, - "35": 7.74511, - "36": 6.73702, - "37": 8.15605, - "38": 7.62885, - "39": 7.97707, - "40": 7.52037, - "41": 7.52443, - "42": 6.12689, - "43": 7.60467, - "44": 7.96883, - "45": 6.84543, - "46": 7.42548, - "47": 7.82723, - "48": 7.87988, - "49": 7.59963, - "50": 6.85112 + "3": 10.17793, + "4": 10.86283, + "5": 9.81719, + "6": 9.10416, + "7": 9.61067, + "8": 8.39543, + "9": 7.79835, + "10": 7.15295, + "11": 9.06686, + "12": 12.40969, + "13": 8.05055, + "14": 8.2476, + "15": 8.25138, + "16": 8.32761, + "17": 8.33769, + "18": 7.57521, + "19": 8.18843, + "20": 7.70464, + "21": 8.00008, + "22": 7.35567, + "23": 7.9428, + "24": 7.49828, + "25": 8.31989, + "26": 7.79139, + "27": 7.72813, + "28": 7.70354, + "29": 7.77157, + "30": 7.56925, + "31": 7.85097, + "32": 6.53309, + "33": 7.24762, + "34": 7.79993, + "35": 7.74601, + "36": 6.74083, + "37": 8.15463, + "38": 7.62637, + "39": 7.97973, + "40": 7.52426, + "41": 7.52118, + "42": 6.11695, + "43": 7.60509, + "44": 7.96979, + "45": 6.84567, + "46": 7.4309, + "47": 7.82486, + "48": 7.87887, + "49": 7.59924, + "50": 6.85064 } }, "num-zeros": { @@ -61,56 +61,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 47167840.0, - "2": 46900628.0, - "3": 81003512.0, - "4": 243621808.0, - "5": 468555040.0, - "6": 561181184.0, - "7": 958267392.0, - "8": 720794112.0, - "9": 771164224.0, - "10": 718302016.0, - "11": 669618304.0, - "12": 559500096.0, - "13": 642601344.0, - "14": 754397952.0, - "15": 766531584.0, - "16": 697850240.0, - "17": 654906240.0, - "18": 745861824.0, - "19": 738620928.0, - "20": 887555328.0, - "21": 729800064.0, - "22": 666937216.0, - "23": 777389312.0, - "24": 607175552.0, - "25": 855782784.0, - "26": 846129152.0, - "27": 666477056.0, - "28": 830677504.0, - "29": 811523712.0, - "30": 657771072.0, - "31": 609501440.0, - "32": 784538816.0, - "33": 755198720.0, - "34": 729929280.0, - "35": 719482368.0, - "36": 699006208.0, - "37": 727900096.0, - "38": 711973824.0, - "39": 701515264.0, - "40": 682162752.0, - "41": 534678112.0, - "42": 655361792.0, - "43": 663463424.0, - "44": 642541952.0, - "45": 455907168.0, - "46": 613359936.0, - "47": 592108160.0, - "48": 585115008.0, - "49": 559483008.0, - "50": 544390208.0 + "1": 47167816.0, + "2": 46900776.0, + "3": 77860808.0, + "4": 237329376.0, + "5": 471709792.0, + "6": 558041536.0, + "7": 948826176.0, + "8": 723939584.0, + "9": 786891776.0, + "10": 734021888.0, + "11": 688478400.0, + "12": 553228736.0, + "13": 608009792.0, + "14": 741806976.0, + "15": 766532736.0, + "16": 685280512.0, + "17": 654899648.0, + "18": 730146112.0, + "19": 751163904.0, + "20": 884406592.0, + "21": 723541120.0, + "22": 805299648.0, + "23": 789975808.0, + "24": 610294016.0, + "25": 830610048.0, + "26": 824111232.0, + "27": 757678144.0, + "28": 774057088.0, + "29": 805232640.0, + "30": 770995712.0, + "31": 801384640.0, + "32": 790830656.0, + "33": 758341184.0, + "34": 726777280.0, + "35": 750934144.0, + "36": 717880064.0, + "37": 740480704.0, + "38": 724556544.0, + "39": 710957376.0, + "40": 716765760.0, + "41": 531516928.0, + "42": 658507328.0, + "43": 676045888.0, + "44": 680286208.0, + "45": 606880576.0, + "46": 641672384.0, + "47": 633002368.0, + "48": 607136576.0, + "49": 430551968.0, + "50": 563263808.0 } }, "mem-allocated-bytes": { @@ -178,53 +178,53 @@ "1": 4919527424.0, "2": 5861408768.0, "3": 5861408768.0, - "4": 5863651328.0, - "5": 5863651328.0, - "6": 5863651328.0, - "7": 5863651328.0, - "8": 5863651328.0, - "9": 5863651328.0, - "10": 5863651328.0, - "11": 5863651328.0, - "12": 5863651328.0, - "13": 5863651328.0, - "14": 5863651328.0, - "15": 5863986176.0, - "16": 5865795072.0, - "17": 5865795072.0, - "18": 5865795072.0, - "19": 5865795072.0, - "20": 5865795072.0, - "21": 5866987520.0, - "22": 5866987520.0, - "23": 5866987520.0, - "24": 5866987520.0, - "25": 5866987520.0, - "26": 5866987520.0, - "27": 5866987520.0, - "28": 5866987520.0, - "29": 5866987520.0, - "30": 5866987520.0, - "31": 5866987520.0, - "32": 5866987520.0, - "33": 5866987520.0, - "34": 5866987520.0, - "35": 5866987520.0, - "36": 5866987520.0, - "37": 5866987520.0, - "38": 5866987520.0, - "39": 5866987520.0, - "40": 5866987520.0, - "41": 5866987520.0, - "42": 5866987520.0, - "43": 5866987520.0, - "44": 5866987520.0, - "45": 5866987520.0, - "46": 5866987520.0, - "47": 5866987520.0, - "48": 5866987520.0, - "49": 5866987520.0, - "50": 5866987520.0 + "4": 5865549824.0, + "5": 5865549824.0, + "6": 5865549824.0, + "7": 5865549824.0, + "8": 5865549824.0, + "9": 5865549824.0, + "10": 5865549824.0, + "11": 5865549824.0, + "12": 5865549824.0, + "13": 5865549824.0, + "14": 5865549824.0, + "15": 5865549824.0, + "16": 5865549824.0, + "17": 5865549824.0, + "18": 5865549824.0, + "19": 5866154496.0, + "20": 5866154496.0, + "21": 5866154496.0, + "22": 5866154496.0, + "23": 5866154496.0, + "24": 5866154496.0, + "25": 5866154496.0, + "26": 5866154496.0, + "27": 5866154496.0, + "28": 5866154496.0, + "29": 5866154496.0, + "30": 5866154496.0, + "31": 5866154496.0, + "32": 5866154496.0, + "33": 5866154496.0, + "34": 5866154496.0, + "35": 5866154496.0, + "36": 5866154496.0, + "37": 5866154496.0, + "38": 5866154496.0, + "39": 5866154496.0, + "40": 5866154496.0, + "41": 5866154496.0, + "42": 5866154496.0, + "43": 5866154496.0, + "44": 5866154496.0, + "45": 5866154496.0, + "46": 5866154496.0, + "47": 5866154496.0, + "48": 5866154496.0, + "49": 5866154496.0, + "50": 5866154496.0 } }, "iteration-time": { @@ -232,56 +232,56 @@ "end_step": 50, "step_interval": 1, "values": { - "1": 72.699, - "2": 4.27015, - "3": 3.87365, - "4": 3.67041, - "5": 3.65964, - "6": 3.48532, - "7": 3.47679, - "8": 3.47349, - "9": 3.43879, - "10": 3.47441, - "11": 3.45737, - "12": 3.48691, - "13": 3.54474, - "14": 3.44102, - "15": 3.42127, - "16": 3.45795, - "17": 3.49717, - "18": 3.51293, - "19": 3.5617, - "20": 3.49733, - "21": 3.50336, - "22": 3.62308, - "23": 3.50166, - "24": 3.49075, - "25": 3.50996, - "26": 3.44423, - "27": 3.47323, - "28": 3.53784, - "29": 3.51989, - "30": 3.49211, - "31": 3.49945, - "32": 3.4419, - "33": 3.50458, - "34": 3.47663, - "35": 3.45702, - "36": 3.50281, - "37": 3.44136, - "38": 3.45165, - "39": 3.50095, - "40": 3.50126, - "41": 3.50863, - "42": 3.46684, - "43": 3.55122, - "44": 3.48372, - "45": 3.46903, - "46": 3.47654, - "47": 3.51574, - "48": 3.4895, - "49": 3.49404, - "50": 3.45824 + "1": 86.37903, + "2": 4.30499, + "3": 5.51749, + "4": 4.16842, + "5": 5.35652, + "6": 3.7018, + "7": 3.68633, + "8": 3.75304, + "9": 3.67596, + "10": 3.70408, + "11": 3.70621, + "12": 3.71713, + "13": 3.73785, + "14": 3.64923, + "15": 3.63825, + "16": 3.64129, + "17": 3.71791, + "18": 3.69956, + "19": 4.27786, + "20": 4.04035, + "21": 3.67423, + "22": 3.66455, + "23": 3.67758, + "24": 4.16675, + "25": 3.71546, + "26": 3.71205, + "27": 3.71193, + "28": 3.60188, + "29": 3.69233, + "30": 3.68235, + "31": 3.69734, + "32": 3.69173, + "33": 3.64974, + "34": 3.73647, + "35": 3.68627, + "36": 3.70357, + "37": 3.71094, + "38": 3.72508, + "39": 3.70553, + "40": 3.6995, + "41": 3.61312, + "42": 3.63624, + "43": 3.68714, + "44": 3.70371, + "45": 3.67257, + "46": 3.73701, + "47": 3.69639, + "48": 3.65815, + "49": 3.63754, + "50": 3.71569 } } } \ No newline at end of file From bcc9fba4d33cdb79fd2c4c7c57556738d4d46bc1 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Mon, 17 Nov 2025 19:53:37 -0800 Subject: [PATCH 09/10] support new TE version Signed-off-by: Hongbin Liu --- megatron/core/extensions/transformer_engine.py | 9 +++++---- .../fine_grained_activation_offload.py | 12 +++++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py index 808ac14a2e..b444eaff9a 100644 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -2163,8 +2163,9 @@ def set_save_original_input(module): try: # pylint: disable=unused-import - from transformer_engine.pytorch import cpu_offload - from transformer_engine.pytorch.float8_tensor import Float8Tensor + from transformer_engine.pytorch import cpu_offload_v1 as cpu_offload except ImportError: - Float8Tensor = None - cpu_offload = None + try: + from transformer_engine.pytorch import cpu_offload + except ImportError: + cpu_offload = None diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py index 1e280a09d3..a818d8486e 100644 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -200,6 +200,8 @@ def __enter__(self): if cpu_offload is not None: cpu_offload.CPUOffloadEnabled = True + else: + raise RuntimeError("TE CPU offload is not available") self.inside_context = True torch._C._autograd._push_saved_tensors_default_hooks( @@ -213,6 +215,8 @@ def __exit__(self, *args: Any): if cpu_offload is not None: cpu_offload.CPUOffloadEnabled = False + else: + raise RuntimeError("TE CPU offload is not available") self.inside_context = False torch._C._autograd._pop_saved_tensors_default_hooks() @@ -244,24 +248,18 @@ class ChunkOffloadHandler: def offload(src_tensor, pin_memory=True): """Offload.""" debug_rank("--------offload") - from megatron.core.extensions.transformer_engine import Float8Tensor - - fp8_offload = isinstance(src_tensor, Float8Tensor) if Float8Tensor is not None else False if not src_tensor.is_contiguous(): src_tensor = src_tensor.contiguous() cpu_backup = torch.empty( src_tensor.size(), - dtype=torch.uint8 if fp8_offload else src_tensor.dtype, + dtype=src_tensor.dtype, layout=src_tensor.layout, device="cpu", pin_memory=pin_memory, ) - if fp8_offload: - cpu_backup = Float8Tensor.make_like(src_tensor, data=cpu_backup) - cpu_backup.copy_(src_tensor, non_blocking=pin_memory) state = (src_tensor.device, cpu_backup) return state From 8d1e509d72ba6a5c38d64d79d28bc9436f6b9eec Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Mon, 17 Nov 2025 20:37:04 -0800 Subject: [PATCH 10/10] move set_ideal_affinity_for_current_gpu to pipeline/util Signed-off-by: Hongbin Liu --- .../fine_grained_activation_offload.py | 35 ++----------------- megatron/core/pipeline_parallel/utils.py | 33 +++++++++++++++++ 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py index a818d8486e..8397c51988 100644 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -7,6 +7,8 @@ import torch +from megatron.core.pipeline_parallel.utils import set_ideal_affinity_for_current_gpu + # CPU offload implementation for pipeline parallelism DEBUG = False DEBUG_RANK = 0 @@ -22,39 +24,6 @@ def debug_rank(message): print(message) -def set_ideal_affinity_for_current_gpu(): - """Set CPU affinity for the current GPU to optimize host-device transfers.""" - import uuid - - try: - import cuda.bindings.driver as cuda_driver - import cuda.bindings.runtime as cuda_runtime - except ImportError: - try: - import cuda.cuda as cuda_driver - import cuda.cudart as cuda_runtime - except ImportError: - # print("cuda-python may not be installed, skipping GPU affinity setting") - warnings.warn("cuda-python may not be installed, skipping GPU affinity setting") - return - try: - import pynvml - except ImportError: - warnings.warn("pynvml is not installed, skipping GPU affinity setting") - return - - # Get current CUDA device ID - err, device_id = cuda_runtime.cudaGetDevice() - assert err == cuda_runtime.cudaError_t.cudaSuccess - # Get device UUID - err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id) - assert err == cuda_driver.CUresult.CUDA_SUCCESS - # Set CPU affinity based on GPU's NUMA node - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) - pynvml.nvmlDeviceSetCpuAffinity(handle) - - class PipelineOffloadManager: """ Singleton manager for coordinating activation offloading across pipeline stages. diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py index fae8e5466d..c50c6ac796 100644 --- a/megatron/core/pipeline_parallel/utils.py +++ b/megatron/core/pipeline_parallel/utils.py @@ -80,6 +80,39 @@ def make_viewless(e): return e +def set_ideal_affinity_for_current_gpu(): + """Set CPU affinity for the current GPU to optimize host-device transfers.""" + import uuid + + try: + import cuda.bindings.driver as cuda_driver + import cuda.bindings.runtime as cuda_runtime + except ImportError: + try: + import cuda.cuda as cuda_driver + import cuda.cudart as cuda_runtime + except ImportError: + # print("cuda-python may not be installed, skipping GPU affinity setting") + warnings.warn("cuda-python may not be installed, skipping GPU affinity setting") + return + try: + import pynvml + except ImportError: + warnings.warn("pynvml is not installed, skipping GPU affinity setting") + return + + # Get current CUDA device ID + err, device_id = cuda_runtime.cudaGetDevice() + assert err == cuda_runtime.cudaError_t.cudaSuccess + # Get device UUID + err, device_uuid = cuda_driver.cuDeviceGetUuid(device_id) + assert err == cuda_driver.CUresult.CUDA_SUCCESS + # Set CPU affinity based on GPU's NUMA node + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByUUID("GPU-" + str(uuid.UUID(bytes=device_uuid.bytes))) + pynvml.nvmlDeviceSetCpuAffinity(handle) + + @contextmanager def stream_acquire_context(stream, event): """Stream acquire context"""