From 11336411d9ea2973e8516e39309e9abfaa5c5698 Mon Sep 17 00:00:00 2001 From: Mr-Philo <1347549342@qq.com> Date: Thu, 3 Jul 2025 06:14:22 +0000 Subject: [PATCH 1/3] add example training script in fp4 format --- gpt3/pretrain_345m_megatron.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/gpt3/pretrain_345m_megatron.sh b/gpt3/pretrain_345m_megatron.sh index be1651b..d97a312 100644 --- a/gpt3/pretrain_345m_megatron.sh +++ b/gpt3/pretrain_345m_megatron.sh @@ -4,7 +4,7 @@ set -e -USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -102,6 +102,21 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH + +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_345m_fp4 + export USE_W_SIMU_FP4=1 + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export USE_A_SIMU_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH else echo $USAGE exit 1 From f607e13b6a55b51f10742c4130834fc9783c86af Mon Sep 17 00:00:00 2001 From: Mr-Philo <1347549342@qq.com> Date: Fri, 4 Jul 2025 07:33:00 +0000 Subject: [PATCH 2/3] add reproduce instructions --- gpt3/README.md | 25 +++++++++++++++++++++++++ gpt3/pretrain_13b_megatron.sh | 16 +++++++++++++++- gpt3/pretrain_6b7_megatron.sh | 16 +++++++++++++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/gpt3/README.md b/gpt3/README.md index 659cb62..3055bc2 100644 --- a/gpt3/README.md +++ b/gpt3/README.md @@ -101,3 +101,28 @@ If you want to train GPT-3 with Megatron-LM using multiple nodes, you need: export NCCL_DEBUG=WARN ``` - Use a parallel ssh tool to start the script on all nodes. 
+ +## Using FP4 Training +If you want to reproduce the training result from paper: [Optimizing Large Language Model Training Using FP4 Quantization](https://arxiv.org/abs/2501.17116), please follow the steps below. + +1. Install [MS-AMP](https://github.com/Azure/MS-AMP). We recommend you [install from source](https://azure.github.io/MS-AMP/docs/getting-started/installation#install-from-source). If you're using the pre-installed MS-AMP [Docker container](https://azure.github.io/MS-AMP/docs/user-tutorial/container-images), you need to rebuild the msamp package to support the fp4 quantization feature using the following command: + ```bash + python3 -m pip install . + make postinstall + ``` +2. Install dependencies, prepare the data, and apply the patch to Megatron-LM (following the steps previously shown in this README file) +3. Launch script: + ```bash + # for 345M model + bash pretrain_345m_megatron.sh fp4 + # for 6.7B model + bash pretrain_6b7_megatron.sh fp4 + # for 13B model + bash pretrain_13b_megatron.sh fp4 + ``` + In these scripts, you can adjust the environment variables to control the way of FP4 quantization. 
These are: + ```txt + export USE_W_SIMU_FP4=1 # control if weight quantization is used + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # control if DGE(Differentiable Gradient Estimator) is used + export USE_A_SIMU_FP4=1 # control if activation quantization is used + ``` \ No newline at end of file diff --git a/gpt3/pretrain_13b_megatron.sh b/gpt3/pretrain_13b_megatron.sh index 442c154..2b679ef 100644 --- a/gpt3/pretrain_13b_megatron.sh +++ b/gpt3/pretrain_13b_megatron.sh @@ -5,7 +5,7 @@ set -e -USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -114,6 +114,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_13b_fp4 + export USE_W_SIMU_FP4=1 + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export USE_A_SIMU_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH else echo $USAGE exit 1 diff --git a/gpt3/pretrain_6b7_megatron.sh b/gpt3/pretrain_6b7_megatron.sh index 3ba391c..bdf4902 100644 --- a/gpt3/pretrain_6b7_megatron.sh +++ b/gpt3/pretrain_6b7_megatron.sh @@ -5,7 +5,7 @@ set -e -USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -115,6 +115,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then --fp8-hybrid \ --transformer-impl transformer_engine \ --msamp +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_6b7_fp4 + export USE_W_SIMU_FP4=1 + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export USE_A_SIMU_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + 
$DATA_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp else echo $USAGE exit 1 From ff5388e90a46f1fff4daec5406664b4038dfce34 Mon Sep 17 00:00:00 2001 From: Mr-Philo <1347549342@qq.com> Date: Fri, 11 Jul 2025 03:17:49 +0000 Subject: [PATCH 3/3] update names for env variables --- gpt3/README.md | 7 ++++--- gpt3/pretrain_13b_megatron.sh | 6 +++--- gpt3/pretrain_345m_megatron.sh | 6 +++--- gpt3/pretrain_6b7_megatron.sh | 6 +++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/gpt3/README.md b/gpt3/README.md index 3055bc2..2121b66 100644 --- a/gpt3/README.md +++ b/gpt3/README.md @@ -107,6 +107,7 @@ If you want to reproduce the training result from paper: [Optimizing Large Langu 1. Install [MS-AMP](https://github.com/Azure/MS-AMP). We recommend you [install from source](https://azure.github.io/MS-AMP/docs/getting-started/installation#install-from-source). If you're using the pre-installed MS-AMP [Docker container](https://azure.github.io/MS-AMP/docs/user-tutorial/container-images), you need to rebuild the msamp package to support the fp4 quantization feature using the following command: ```bash + cd MS-AMP python3 -m pip install . make postinstall ``` @@ -122,7 +123,7 @@ If you want to reproduce the training result from paper: [Optimizing Large Langu ``` In these scripts, you can adjust the environment variables to control the way of FP4 quantization. 
These are: ```txt - export USE_W_SIMU_FP4=1 # control if weight quantization is used - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # control if DGE(Differentiable Gradient Estimator) is used - export USE_A_SIMU_FP4=1 # control if activation quantization is used + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 # control if weight quantization is used + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # control if DGE(Differentiable Gradient Estimator) is used + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 # control if activation quantization is used ``` \ No newline at end of file diff --git a/gpt3/pretrain_13b_megatron.sh b/gpt3/pretrain_13b_megatron.sh index 2b679ef..4f9a13b 100644 --- a/gpt3/pretrain_13b_megatron.sh +++ b/gpt3/pretrain_13b_megatron.sh @@ -116,9 +116,9 @@ elif [ "$FP_TYPE" = "msamp" ]; then --load $CHECKPOINT_PATH elif [ "$FP_TYPE" = "fp4" ]; then CHECKPOINT_PATH=$PWD/checkpoints/gpt_13b_fp4 - export USE_W_SIMU_FP4=1 - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 - export USE_A_SIMU_FP4=1 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ $GPT_ARGS \ $DATA_ARGS \ diff --git a/gpt3/pretrain_345m_megatron.sh b/gpt3/pretrain_345m_megatron.sh index d97a312..211a3ba 100644 --- a/gpt3/pretrain_345m_megatron.sh +++ b/gpt3/pretrain_345m_megatron.sh @@ -105,9 +105,9 @@ elif [ "$FP_TYPE" = "msamp" ]; then elif [ "$FP_TYPE" = "fp4" ]; then CHECKPOINT_PATH=$PWD/checkpoints/gpt_345m_fp4 - export USE_W_SIMU_FP4=1 - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 - export USE_A_SIMU_FP4=1 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ $GPT_ARGS \ $DATA_ARGS \ diff --git a/gpt3/pretrain_6b7_megatron.sh 
b/gpt3/pretrain_6b7_megatron.sh index bdf4902..df2f071 100644 --- a/gpt3/pretrain_6b7_megatron.sh +++ b/gpt3/pretrain_6b7_megatron.sh @@ -117,9 +117,9 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp elif [ "$FP_TYPE" = "fp4" ]; then CHECKPOINT_PATH=$PWD/checkpoints/gpt_6b7_fp4 - export USE_W_SIMU_FP4=1 - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 - export USE_A_SIMU_FP4=1 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ $GPT_ARGS \ $DATA_ARGS \