From 11336411d9ea2973e8516e39309e9abfaa5c5698 Mon Sep 17 00:00:00 2001 From: Mr-Philo <1347549342@qq.com> Date: Thu, 3 Jul 2025 06:14:22 +0000 Subject: [PATCH 1/3] add example training script in fp4 format --- gpt3/pretrain_345m_megatron.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/gpt3/pretrain_345m_megatron.sh b/gpt3/pretrain_345m_megatron.sh index be1651b..d97a312 100644 --- a/gpt3/pretrain_345m_megatron.sh +++ b/gpt3/pretrain_345m_megatron.sh @@ -4,7 +4,7 @@ set -e -USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -102,6 +102,21 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH + +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_345m_fp4 + export USE_W_SIMU_FP4=1 + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export USE_A_SIMU_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH else echo $USAGE exit 1 From f607e13b6a55b51f10742c4130834fc9783c86af Mon Sep 17 00:00:00 2001 From: Mr-Philo <1347549342@qq.com> Date: Fri, 4 Jul 2025 07:33:00 +0000 Subject: [PATCH 2/3] add reproduce instructions --- gpt3/README.md | 25 +++++++++++++++++++++++++ gpt3/pretrain_13b_megatron.sh | 16 +++++++++++++++- gpt3/pretrain_6b7_megatron.sh | 16 +++++++++++++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/gpt3/README.md b/gpt3/README.md index 659cb62..3055bc2 100644 --- a/gpt3/README.md +++ b/gpt3/README.md @@ -101,3 +101,28 @@ If you want to train GPT-3 with Megatron-LM using multiple nodes, you need: export NCCL_DEBUG=WARN ``` - Use a parallel ssh tool to start the script on all nodes. 
+ +## Using FP4 Training +If you want to reproduce the training result from paper: [Optimizing Large Language Model Training Using FP4 Quantization](https://arxiv.org/abs/2501.17116), please follow the steps below. + +1. Install [MS-AMP](https://github.com/Azure/MS-AMP). We recommend you [install from source](https://azure.github.io/MS-AMP/docs/getting-started/installation#install-from-source). If you're using the pre-installed MS-AMP [Docker container](https://azure.github.io/MS-AMP/docs/user-tutorial/container-images), you need to rebuild the msamp package to support the fp4 quantization feature using the following command: + ```bash + python3 -m pip install . + make postinstall + ``` +2. Install dependencies, prepare the data, and apply the patch to Megatron-LM (following the steps previously shown in this README file) +3. Launch script: + ```bash + # for 345M model + bash pretrain_345m_megatron.sh fp4 + # for 6.7B model + bash pretrain_6b7_megatron.sh fp4 + # for 13B model + bash pretrain_13b_megatron.sh fp4 + ``` + In these scripts, you can adjust the environment variables to control the way of FP4 quantization. 
These are: + ```txt + export USE_W_SIMU_FP4=1 # control if weight quantization is used + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # control if DGE(Differentiable Gradient Estimator) is used + export USE_A_SIMU_FP4=1 # control if activation quantization is used + ``` \ No newline at end of file diff --git a/gpt3/pretrain_13b_megatron.sh b/gpt3/pretrain_13b_megatron.sh index 442c154..2b679ef 100644 --- a/gpt3/pretrain_13b_megatron.sh +++ b/gpt3/pretrain_13b_megatron.sh @@ -5,7 +5,7 @@ set -e -USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -114,6 +114,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_13b_fp4 + export USE_W_SIMU_FP4=1 + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export USE_A_SIMU_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH else echo $USAGE exit 1 diff --git a/gpt3/pretrain_6b7_megatron.sh b/gpt3/pretrain_6b7_megatron.sh index 3ba391c..bdf4902 100644 --- a/gpt3/pretrain_6b7_megatron.sh +++ b/gpt3/pretrain_6b7_megatron.sh @@ -5,7 +5,7 @@ set -e -USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -115,6 +115,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then --fp8-hybrid \ --transformer-impl transformer_engine \ --msamp +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_6b7_fp4 + export USE_W_SIMU_FP4=1 + export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export USE_A_SIMU_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + 
$DATA_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp else echo $USAGE exit 1 From ff5388e90a46f1fff4daec5406664b4038dfce34 Mon Sep 17 00:00:00 2001 From: Mr-Philo <1347549342@qq.com> Date: Fri, 11 Jul 2025 03:17:49 +0000 Subject: [PATCH 3/3] update names for env variables --- gpt3/README.md | 7 ++++--- gpt3/pretrain_13b_megatron.sh | 6 +++--- gpt3/pretrain_345m_megatron.sh | 6 +++--- gpt3/pretrain_6b7_megatron.sh | 6 +++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/gpt3/README.md b/gpt3/README.md index 3055bc2..2121b66 100644 --- a/gpt3/README.md +++ b/gpt3/README.md @@ -107,6 +107,7 @@ If you want to reproduce the training result from paper: [Optimizing Large Langu 1. Install [MS-AMP](https://github.com/Azure/MS-AMP). We recommend you [install from source](https://azure.github.io/MS-AMP/docs/getting-started/installation#install-from-source). If you're using the pre-installed MS-AMP [Docker container](https://azure.github.io/MS-AMP/docs/user-tutorial/container-images), you need to rebuild the msamp package to support the fp4 quantization feature using the following command: ```bash + cd MS-AMP python3 -m pip install . make postinstall ``` @@ -122,7 +123,7 @@ If you want to reproduce the training result from paper: [Optimizing Large Langu ``` In these scripts, you can adjust the environment variables to control the way of FP4 quantization. 
These are: ```txt - export USE_W_SIMU_FP4=1 # control if weight quantization is used - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # control if DGE(Differentiable Gradient Estimator) is used - export USE_A_SIMU_FP4=1 # control if activation quantization is used + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 # control if weight quantization is used + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # control if DGE(Differentiable Gradient Estimator) is used + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 # control if activation quantization is used ``` \ No newline at end of file diff --git a/gpt3/pretrain_13b_megatron.sh b/gpt3/pretrain_13b_megatron.sh index 2b679ef..4f9a13b 100644 --- a/gpt3/pretrain_13b_megatron.sh +++ b/gpt3/pretrain_13b_megatron.sh @@ -116,9 +116,9 @@ elif [ "$FP_TYPE" = "msamp" ]; then --load $CHECKPOINT_PATH elif [ "$FP_TYPE" = "fp4" ]; then CHECKPOINT_PATH=$PWD/checkpoints/gpt_13b_fp4 - export USE_W_SIMU_FP4=1 - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 - export USE_A_SIMU_FP4=1 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ $GPT_ARGS \ $DATA_ARGS \ diff --git a/gpt3/pretrain_345m_megatron.sh b/gpt3/pretrain_345m_megatron.sh index d97a312..211a3ba 100644 --- a/gpt3/pretrain_345m_megatron.sh +++ b/gpt3/pretrain_345m_megatron.sh @@ -105,9 +105,9 @@ elif [ "$FP_TYPE" = "msamp" ]; then elif [ "$FP_TYPE" = "fp4" ]; then CHECKPOINT_PATH=$PWD/checkpoints/gpt_345m_fp4 - export USE_W_SIMU_FP4=1 - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 - export USE_A_SIMU_FP4=1 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ $GPT_ARGS \ $DATA_ARGS \ diff --git a/gpt3/pretrain_6b7_megatron.sh 
b/gpt3/pretrain_6b7_megatron.sh index bdf4902..df2f071 100644 --- a/gpt3/pretrain_6b7_megatron.sh +++ b/gpt3/pretrain_6b7_megatron.sh @@ -117,9 +117,9 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp elif [ "$FP_TYPE" = "fp4" ]; then CHECKPOINT_PATH=$PWD/checkpoints/gpt_6b7_fp4 - export USE_W_SIMU_FP4=1 - export USE_W_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 - export USE_A_SIMU_FP4=1 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ $GPT_ARGS \ $DATA_ARGS \