26 changes: 26 additions & 0 deletions gpt3/README.md
@@ -101,3 +101,29 @@ If you want to train GPT-3 with Megatron-LM using multiple nodes, you need:
export NCCL_DEBUG=WARN
```
- Use a parallel ssh tool to start the script on all nodes.
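For example, with `pdsh` (any parallel ssh tool works; the host list and repository path below are placeholders, not values from this repository):

```shell
# Hypothetical host list and checkout path -- adjust to your cluster layout.
# Runs the same launch script on node1 through node4 in parallel.
pdsh -w 'node[1-4]' 'cd /path/to/gpt3 && bash pretrain_13b_megatron.sh msamp'
```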

## Using FP4 Training
If you want to reproduce the training results from the paper [Optimizing Large Language Model Training Using FP4 Quantization](https://arxiv.org/abs/2501.17116), follow these steps.

1. Install [MS-AMP](https://github.com/Azure/MS-AMP). We recommend [installing from source](https://azure.github.io/MS-AMP/docs/getting-started/installation#install-from-source). If you are using the pre-installed MS-AMP [Docker container](https://azure.github.io/MS-AMP/docs/user-tutorial/container-images), you need to rebuild the msamp package to support the FP4 quantization feature with the following commands:
```bash
cd MS-AMP
python3 -m pip install .
make postinstall
```
2. Install the dependencies, prepare the data, and apply the patch to Megatron-LM (following the steps shown earlier in this README).
3. Launch the training script:
```bash
# for 345M model
bash pretrain_345m_megatron.sh fp4
# for 6.7B model
bash pretrain_6b7_megatron.sh fp4
# for 13B model
bash pretrain_13b_megatron.sh fp4
```
In these scripts, you can adjust the following environment variables to control how FP4 quantization is applied:
```txt
export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 # controls whether weight quantization is used
export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # controls whether the Differentiable Gradient Estimator (DGE) is used
export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 # controls whether activation quantization is used
```
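Note that the `fp4` branch of each script sets all three variables itself, so to change one of them you edit the corresponding `export` lines inside the script rather than setting them in your shell before launching. As a purely illustrative ablation (assuming setting a flag to `0` disables it; not a configuration recommended by the paper), weight-only FP4 quantization would look like:

```shell
# Illustrative edit inside the fp4 branch of a launch script:
# quantize weights to simulated FP4 and smooth their gradients with DGE,
# but keep activations in the default (higher) precision.
export MSAMP_USE_WEIGHT_SIMULATE_FP4=1
export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1
export MSAMP_USE_ACTIVATION_SIMULATE_FP4=0
```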
16 changes: 15 additions & 1 deletion gpt3/pretrain_13b_megatron.sh
@@ -5,7 +5,7 @@

set -e

USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp]"
USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp|fp4]"

if [ "$#" -ne 1 ]; then
echo $USAGE
@@ -114,6 +114,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then
--msamp \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
elif [ "$FP_TYPE" = "fp4" ]; then
CHECKPOINT_PATH=$PWD/checkpoints/gpt_13b_fp4
export MSAMP_USE_WEIGHT_SIMULATE_FP4=1
export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1
export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1
torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--fp8-hybrid \
--transformer-impl transformer_engine \
--msamp \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
else
echo $USAGE
exit 1
17 changes: 16 additions & 1 deletion gpt3/pretrain_345m_megatron.sh
@@ -4,7 +4,7 @@

set -e

USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp]"
USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp|fp4]"

if [ "$#" -ne 1 ]; then
echo $USAGE
@@ -102,6 +102,21 @@ elif [ "$FP_TYPE" = "msamp" ]; then
--msamp \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH

elif [ "$FP_TYPE" = "fp4" ]; then
CHECKPOINT_PATH=$PWD/checkpoints/gpt_345m_fp4
export MSAMP_USE_WEIGHT_SIMULATE_FP4=1
export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1
export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1
torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--fp8-hybrid \
--transformer-impl transformer_engine \
--msamp \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
else
echo $USAGE
exit 1
16 changes: 15 additions & 1 deletion gpt3/pretrain_6b7_megatron.sh
@@ -5,7 +5,7 @@

set -e

USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp]"
USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp|fp4]"

if [ "$#" -ne 1 ]; then
echo $USAGE
@@ -115,6 +115,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then
--fp8-hybrid \
--transformer-impl transformer_engine \
--msamp
elif [ "$FP_TYPE" = "fp4" ]; then
CHECKPOINT_PATH=$PWD/checkpoints/gpt_6b7_fp4
export MSAMP_USE_WEIGHT_SIMULATE_FP4=1
export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1
export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1
torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--fp8-hybrid \
--transformer-impl transformer_engine \
--msamp
else
echo $USAGE
exit 1