diff --git a/gpt3/README.md b/gpt3/README.md index 659cb62..2121b66 100644 --- a/gpt3/README.md +++ b/gpt3/README.md @@ -101,3 +101,29 @@ If you want to train GPT-3 with Megatron-LM using multiple nodes, you need: export NCCL_DEBUG=WARN ``` - Use a parallel ssh tool to start the script on all nodes. + +## Using FP4 Training +If you want to reproduce the training result from the paper [Optimizing Large Language Model Training Using FP4 Quantization](https://arxiv.org/abs/2501.17116), please follow these steps. + +1. Install [MS-AMP](https://github.com/Azure/MS-AMP). We recommend you [install from source](https://azure.github.io/MS-AMP/docs/getting-started/installation#install-from-source). If you're using the pre-installed MS-AMP [Docker container](https://azure.github.io/MS-AMP/docs/user-tutorial/container-images), you need to rebuild the msamp package to support the fp4 quantization feature using the following command: + ```bash + cd MS-AMP + python3 -m pip install . + make postinstall + ```
+2. Install dependencies, prepare the data, and apply the patch to Megatron-LM (following the steps previously shown in this README file) +3. Launch the script: + ```bash + # for 345M model + bash pretrain_345m_megatron.sh fp4 + # for 6.7B model + bash pretrain_6b7_megatron.sh fp4 + # for 13B model + bash pretrain_13b_megatron.sh fp4 + ```
+ In these scripts, you can adjust the environment variables to control the way of FP4 quantization. 
These are: + ```txt + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 # control if weight quantization is used + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 # control if DGE(Differentiable Gradient Estimator) is used + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 # control if activation quantization is used + ``` \ No newline at end of file diff --git a/gpt3/pretrain_13b_megatron.sh b/gpt3/pretrain_13b_megatron.sh index 442c154..4f9a13b 100644 --- a/gpt3/pretrain_13b_megatron.sh +++ b/gpt3/pretrain_13b_megatron.sh @@ -5,7 +5,7 @@ set -e -USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_13b_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -114,6 +114,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_13b_fp4 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH else echo $USAGE exit 1 diff --git a/gpt3/pretrain_345m_megatron.sh b/gpt3/pretrain_345m_megatron.sh index be1651b..211a3ba 100644 --- a/gpt3/pretrain_345m_megatron.sh +++ b/gpt3/pretrain_345m_megatron.sh @@ -4,7 +4,7 @@ set -e -USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_345m_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -102,6 +102,21 @@ elif [ "$FP_TYPE" = "msamp" ]; then --msamp \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH + +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_345m_fp4 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH else echo $USAGE exit 1 
diff --git a/gpt3/pretrain_6b7_megatron.sh b/gpt3/pretrain_6b7_megatron.sh index 3ba391c..df2f071 100644 --- a/gpt3/pretrain_6b7_megatron.sh +++ b/gpt3/pretrain_6b7_megatron.sh @@ -5,7 +5,7 @@ set -e -USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp]" +USAGE="usage: bash pretrain_6b7_megatron.sh [bf16|te|msamp|fp4]" if [ "$#" -ne 1 ]; then echo $USAGE @@ -115,6 +115,20 @@ elif [ "$FP_TYPE" = "msamp" ]; then --fp8-hybrid \ --transformer-impl transformer_engine \ --msamp +elif [ "$FP_TYPE" = "fp4" ]; then + CHECKPOINT_PATH=$PWD/checkpoints/gpt_6b7_fp4 + export MSAMP_USE_WEIGHT_SIMULATE_FP4=1 + export MSAMP_USE_WEIGHT_DIFFERENTIABLE_GRADIENT_ESTIMATOR=1 + export MSAMP_USE_ACTIVATION_SIMULATE_FP4=1 + torchrun $DISTRIBUTED_ARGS ../third_party/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --fp8-hybrid \ + --transformer-impl transformer_engine \ + --msamp else echo $USAGE exit 1