fix qwen1.5-megatron and add 32b (#209)
lwmlyy authored Apr 26, 2024
1 parent a53fdb1 commit 0ab8902
Showing 10 changed files with 238 additions and 49 deletions.
20 changes: 17 additions & 3 deletions examples/qwen1_5/run_evaluate_megatron_qwen.sh
@@ -28,7 +28,7 @@ MODEL_SIZE=$3
BATCH_SIZE=$4
SEQ_LEN=$5
PAD_LEN=$6
EXTRA_VOCAB_SIZE=$7
EXTRA_VOCAB_SIZE=$7 # 293 for models smaller than 32b, 421 for those larger
PR=$8
TP=$9
PP=${10}
@@ -40,6 +40,7 @@ TE=${15}
DATASET_PATH=${16}
PRETRAIN_CHECKPOINT_PATH=${17}

gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
@@ -68,13 +69,26 @@ HIDDEN_SIZE=4096
NUM_ATTN_HEADS=32
INTERMEDIATE_SIZE=11008


elif [ $MODEL_SIZE = 14B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392
MAX_POSITION_EMBEDDINGS=2048

gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
@@ -176,15 +190,15 @@ megatron_options=" \
--patch-tokenizer-type LLamaTokenizer \
--swiglu \
--normalization RMSNorm \
--use-llama2-rotary-position-embeddings \
--use-rotary-position-embeddings \
--position-embedding-type rope \
--untie-embeddings-and-output-weights \
--rotary-base 1000000 \
--rotary-scale-factor 1 \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/evaluate_megatron_llama.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
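
The scripts that gain a 32B branch in this commit all handle it the same way: gqa_options starts out empty and is only filled in for MODEL_SIZE=32B, because Qwen1.5-32B uses grouped-query attention (40 query heads sharing 8 key/value groups, i.e. 5 query heads per KV head) while the smaller dense sizes use standard multi-head attention. A minimal sketch of that pattern, paraphrased from the hunks above rather than copied verbatim from any one script:

# Grouped-query attention flags are emitted only for the 32B model.
gqa_options=""
if [ $MODEL_SIZE = 32B ]; then
    gqa_options=" \
        --group-query-attention \
        --num-query-groups 8"   # 40 attention heads / 8 groups = 5 query heads per KV head
fi

# For every other size the variable expands to nothing, so appending it to the
# launch command is harmless there.
run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/evaluate_megatron_llama.py ${megatron_options} ${gqa_options}"
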
3 changes: 1 addition & 2 deletions examples/qwen1_5/run_finetune_mcore_qwen_withGA.sh
@@ -268,8 +268,7 @@ megatron_options=" \
--use-mcore-models \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--rotary-seq-len-interpolation-factor 1 \
--eod-mask-loss
--rotary-seq-len-interpolation-factor 1
"

run_cmd="torchrun $DISTRIBUTED_ARGS pretrain_mcore_qwen.py
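
The dropped --eod-mask-loss flag is a stock Megatron-LM option that excludes end-of-document tokens from the training loss, so after this change the mcore finetune run also trains on those positions. If the previous behavior is wanted, the flag can simply be appended again before the launch command is assembled (a hypothetical local override, not part of this script):

# Re-enable loss masking on end-of-document tokens.
megatron_options="${megatron_options} \
    --eod-mask-loss"
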
17 changes: 15 additions & 2 deletions examples/qwen1_5/run_finetune_megatron_qwen.sh
@@ -45,7 +45,7 @@ PRETRAIN_CHECKPOINT_PATH=${20}
EPOCH=${21}
OUTPUT_BASEPATH=${22}


gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
@@ -81,13 +81,26 @@ HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392


gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
HIDDEN_SIZE=8192
NUM_ATTN_HEADS=64
INTERMEDIATE_SIZE=24576


fi

if [ $AC = full ]; then
@@ -220,7 +233,7 @@ megatron_options=" \
"

run_cmd="torchrun $DISTRIBUTED_ARGS pretrain_mcore_qwen.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
38 changes: 25 additions & 13 deletions examples/qwen1_5/run_finetune_megatron_qwen_withGA.sh
@@ -7,12 +7,12 @@ MEGATRON_PATH=${MEGATRON_PATCH_PATH}/Megatron-LM-231007
export PYTHONPATH=${MEGATRON_PATH}:${MEGATRON_PATCH_PATH}:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
if [ $ENV = dsw ]; then
export CUDA_VISIBLE_DEVICES=7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
MASTER_ADDR=localhost
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
NNODES=1
NODE_RANK=0
GPUS_PER_NODE=1
GPUS_PER_NODE=8

elif [ $ENV = dlc ]; then

@@ -31,7 +31,7 @@ LR=$6
MIN_LR=$7
SEQ_LEN=$8
PAD_LEN=$9
EXTRA_VOCAB_SIZE=${10}
EXTRA_VOCAB_SIZE=${10} # 293 for models smaller than 32b, 421 for those larger
PR=${11}
TP=${12}
PP=${13}
@@ -54,47 +54,57 @@ NUM_LAYERS=24
HIDDEN_SIZE=1024
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=2816
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 1.8B ]; then

NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=5504
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 4B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=2560
NUM_ATTN_HEADS=20
INTERMEDIATE_SIZE=6912
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 7B ]; then

NUM_LAYERS=32
HIDDEN_SIZE=4096
NUM_ATTN_HEADS=32
INTERMEDIATE_SIZE=11008
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 14B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696
MAX_POSITION_EMBEDDINGS=32768

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392

gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
HIDDEN_SIZE=8192
NUM_ATTN_HEADS=64
INTERMEDIATE_SIZE=24576
MAX_POSITION_EMBEDDINGS=32768


fi

@@ -204,7 +214,7 @@ megatron_options=" \
--num-attention-heads ${NUM_ATTN_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--max-position-embeddings ${SEQ_LEN} \
--max-padding-length ${PAD_LEN} \
--log-interval 1 \
--eval-interval 10000 \
@@ -227,13 +237,15 @@ megatron_options=" \
--patch-tokenizer-type LLamaTokenizer \
--swiglu \
--normalization RMSNorm \
--use-rotary-position-embeddings \
--use-llama2-rotary-position-embeddings \
--position-embedding-type rope \
--untie-embeddings-and-output-weights
--untie-embeddings-and-output-weights \
--rotary-base 1000000 \
--rotary-scale-factor 1 \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/finetune_megatron_llama_withGA.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
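
The --rotary-base 1000000 value added above matches the rope_theta that Qwen1.5 checkpoints carry in their Hugging Face config.json (1,000,000, versus Megatron's default of 10,000), which is also why --rotary-scale-factor stays at 1. A quick way to confirm the value against a downloaded checkpoint (the path below is only an example, assuming the standard config.json layout):

# Assumes a locally downloaded Qwen1.5 checkpoint directory.
python -c "import json; print(json.load(open('Qwen1.5-32B/config.json'))['rope_theta'])"
# expected output: 1000000.0
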
2 changes: 1 addition & 1 deletion examples/qwen1_5/run_pretrain_mcore_qwen.sh
@@ -30,7 +30,7 @@ LR=$6
MIN_LR=$7
SEQ_LEN=$8
PAD_LEN=$9
EXTRA_VOCAB_SIZE=${10}
EXTRA_VOCAB_SIZE=${10} # 293 for models smaller than 32b, 421 for those larger
PR=${11}
TP=${12}
PP=${13}
32 changes: 22 additions & 10 deletions examples/qwen1_5/run_pretrain_megatron_qwen.sh
@@ -31,7 +31,7 @@ LR=$6
MIN_LR=$7
SEQ_LEN=$8
PAD_LEN=$9
EXTRA_VOCAB_SIZE=${10}
EXTRA_VOCAB_SIZE=${10} # 293 for models smaller than 32b, 421 for those larger
PR=${11}
TP=${12}
PP=${13}
@@ -47,54 +47,66 @@ TRAIN_TOKENS=${22}
WARMUP_TOKENS=${23}
OUTPUT_BASEPATH=${24}


gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
HIDDEN_SIZE=1024
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=2816
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 1.8B ]; then

NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=5504
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 4B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=2560
NUM_ATTN_HEADS=20
INTERMEDIATE_SIZE=6912
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 7B ]; then

NUM_LAYERS=32
HIDDEN_SIZE=4096
NUM_ATTN_HEADS=32
INTERMEDIATE_SIZE=11008
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 14B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392


gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
HIDDEN_SIZE=8192
NUM_ATTN_HEADS=64
INTERMEDIATE_SIZE=24576
MAX_POSITION_EMBEDDINGS=32768


fi

@@ -202,7 +214,7 @@ megatron_options=" \
--num-attention-heads ${NUM_ATTN_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--max-position-embeddings ${SEQ_LEN} \
--max-padding-length ${PAD_LEN} \
--log-interval 1 \
--eval-interval 10000 \
@@ -232,7 +244,7 @@ megatron_options=" \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/pretrain_megatron_llama.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
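
The new comment on EXTRA_VOCAB_SIZE (293 below 32B, 421 for 32B and 72B) corresponds to the gap between the Qwen1.5 tokenizer and the padded word-embedding tables of the released checkpoints, assuming the usual sizes of 151,643 tokenizer entries versus 151,936 (0.5B-14B) and 152,064 (32B/72B) embedding rows. The arithmetic below is illustrative only; the concrete sizes are an assumption, not something stated in this diff:

TOKENIZER_VOCAB=151643      # assumed Qwen1.5 tokenizer size
SMALL_EMBED_ROWS=151936     # assumed embedding rows in the 0.5B-14B checkpoints
LARGE_EMBED_ROWS=152064     # assumed embedding rows in the 32B/72B checkpoints
echo $(( SMALL_EMBED_ROWS - TOKENIZER_VOCAB ))   # 293
echo $(( LARGE_EMBED_ROWS - TOKENIZER_VOCAB ))   # 421
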
16 changes: 14 additions & 2 deletions examples/qwen1_5/run_text_generation_megatron_qwen.sh
@@ -21,7 +21,7 @@ TP=$5
BS=$6
SEQ_LEN=$7
PAD_LEN=$8
EXTRA_VOCAB_SIZE=$9
EXTRA_VOCAB_SIZE=$9 # 293 for models smaller than 32b, 421 for those larger
PR=${10}
TOP_K=${11}
INPUT_SEQ_LEN=${12}
@@ -33,6 +33,7 @@ TEMPERATURE=${17}
# set this penalty between 1.1 and 1.5 to reduce repetition, default is 1.2
REPETITION_PENALTY=${18}

gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
@@ -68,6 +69,17 @@ HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392

gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
@@ -132,7 +144,7 @@ rapidformer_options=" \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/generate_text_megatron_llama.py
${rapidformer_options} ${load_options} ${input_options} ${pr_options}"
${rapidformer_options} ${load_options} ${input_options} ${pr_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
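
A side effect of the 32B grouped-query settings that matters most for this text-generation script: the KV cache only stores the 8 key/value groups, not all 40 attention heads. Rough per-token arithmetic with the 32B numbers above (head dimension taken as HIDDEN_SIZE / NUM_ATTN_HEADS, an assumption that holds for these configs):

HIDDEN_SIZE=5120; NUM_ATTN_HEADS=40; NUM_QUERY_GROUPS=8; NUM_LAYERS=64
HEAD_DIM=$(( HIDDEN_SIZE / NUM_ATTN_HEADS ))                      # 128
echo $(( 2 * NUM_ATTN_HEADS * HEAD_DIM * NUM_LAYERS ))            # 655360 cached values per token without GQA
echo $(( 2 * NUM_QUERY_GROUPS * HEAD_DIM * NUM_LAYERS ))          # 131072 cached values per token with 8 KV groups (5x smaller)
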
(Diffs for the remaining 3 changed files are not rendered here.)
