fix qwen1.5-megatron and add 32b (#209)
lwmlyy authored Apr 26, 2024
1 parent a53fdb1 commit 0ab8902
Showing 10 changed files with 238 additions and 49 deletions.
20 changes: 17 additions & 3 deletions examples/qwen1_5/run_evaluate_megatron_qwen.sh
@@ -28,7 +28,7 @@ MODEL_SIZE=$3
BATCH_SIZE=$4
SEQ_LEN=$5
PAD_LEN=$6
EXTRA_VOCAB_SIZE=$7
EXTRA_VOCAB_SIZE=$7 # 293 for models smaller than 32b, 421 for those larger
PR=$8
TP=$9
PP=${10}
@@ -40,6 +40,7 @@ TE=${15}
DATASET_PATH=${16}
PRETRAIN_CHECKPOINT_PATH=${17}

gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
@@ -68,13 +69,26 @@ HIDDEN_SIZE=4096
NUM_ATTN_HEADS=32
INTERMEDIATE_SIZE=11008


elif [ $MODEL_SIZE = 14B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392
MAX_POSITION_EMBEDDINGS=2048

gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
@@ -176,15 +190,15 @@ megatron_options=" \
--patch-tokenizer-type LLamaTokenizer \
--swiglu \
--normalization RMSNorm \
--use-llama2-rotary-position-embeddings \
--use-rotary-position-embeddings \
--position-embedding-type rope \
--untie-embeddings-and-output-weights \
--rotary-base 1000000 \
--rotary-scale-factor 1 \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/evaluate_megatron_llama.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
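
The scripts that gain a 32B branch in this commit all handle it the same way: gqa_options starts out empty and is only filled in for MODEL_SIZE=32B, because Qwen1.5-32B uses grouped-query attention (40 query heads sharing 8 key/value groups, i.e. 5 query heads per KV head) while the smaller dense sizes use standard multi-head attention. A minimal sketch of that pattern, paraphrased from the hunks above rather than copied verbatim from any one script:

# Grouped-query attention flags are emitted only for the 32B model.
gqa_options=""
if [ $MODEL_SIZE = 32B ]; then
    gqa_options=" \
        --group-query-attention \
        --num-query-groups 8"   # 40 attention heads / 8 groups = 5 query heads per KV head
fi

# For every other size the variable expands to nothing, so appending it to the
# launch command is harmless there.
run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/evaluate_megatron_llama.py ${megatron_options} ${gqa_options}"
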
3 changes: 1 addition & 2 deletions examples/qwen1_5/run_finetune_mcore_qwen_withGA.sh
@@ -268,8 +268,7 @@ megatron_options=" \
--use-mcore-models \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--rotary-seq-len-interpolation-factor 1 \
--eod-mask-loss
--rotary-seq-len-interpolation-factor 1
"

run_cmd="torchrun $DISTRIBUTED_ARGS pretrain_mcore_qwen.py
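
The dropped --eod-mask-loss flag is a stock Megatron-LM option that excludes end-of-document tokens from the training loss, so after this change the mcore finetune run also trains on those positions. If the previous behavior is wanted, the flag can simply be appended again before the launch command is assembled (a hypothetical local override, not part of this script):

# Re-enable loss masking on end-of-document tokens.
megatron_options="${megatron_options} \
    --eod-mask-loss"
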
17 changes: 15 additions & 2 deletions examples/qwen1_5/run_finetune_megatron_qwen.sh
@@ -45,7 +45,7 @@ PRETRAIN_CHECKPOINT_PATH=${20}
EPOCH=${21}
OUTPUT_BASEPATH=${22}


gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
@@ -81,13 +81,26 @@ HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392


gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
HIDDEN_SIZE=8192
NUM_ATTN_HEADS=64
INTERMEDIATE_SIZE=24576


fi

if [ $AC = full ]; then
@@ -220,7 +233,7 @@ megatron_options=" \
"

run_cmd="torchrun $DISTRIBUTED_ARGS pretrain_mcore_qwen.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
38 changes: 25 additions & 13 deletions examples/qwen1_5/run_finetune_megatron_qwen_withGA.sh
@@ -7,12 +7,12 @@ MEGATRON_PATH=${MEGATRON_PATCH_PATH}/Megatron-LM-231007
export PYTHONPATH=${MEGATRON_PATH}:${MEGATRON_PATCH_PATH}:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
if [ $ENV = dsw ]; then
export CUDA_VISIBLE_DEVICES=7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
MASTER_ADDR=localhost
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
NNODES=1
NODE_RANK=0
GPUS_PER_NODE=1
GPUS_PER_NODE=8

elif [ $ENV = dlc ]; then

@@ -31,7 +31,7 @@ LR=$6
MIN_LR=$7
SEQ_LEN=$8
PAD_LEN=$9
EXTRA_VOCAB_SIZE=${10}
EXTRA_VOCAB_SIZE=${10} # 293 for models smaller than 32b, 421 for those larger
PR=${11}
TP=${12}
PP=${13}
@@ -54,47 +54,57 @@ NUM_LAYERS=24
HIDDEN_SIZE=1024
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=2816
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 1.8B ]; then

NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=5504
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 4B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=2560
NUM_ATTN_HEADS=20
INTERMEDIATE_SIZE=6912
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 7B ]; then

NUM_LAYERS=32
HIDDEN_SIZE=4096
NUM_ATTN_HEADS=32
INTERMEDIATE_SIZE=11008
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 14B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696
MAX_POSITION_EMBEDDINGS=32768

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392

gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
HIDDEN_SIZE=8192
NUM_ATTN_HEADS=64
INTERMEDIATE_SIZE=24576
MAX_POSITION_EMBEDDINGS=32768


fi

@@ -204,7 +214,7 @@ megatron_options=" \
--num-attention-heads ${NUM_ATTN_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--max-position-embeddings ${SEQ_LEN} \
--max-padding-length ${PAD_LEN} \
--log-interval 1 \
--eval-interval 10000 \
@@ -227,13 +237,15 @@ megatron_options=" \
--patch-tokenizer-type LLamaTokenizer \
--swiglu \
--normalization RMSNorm \
--use-rotary-position-embeddings \
--use-llama2-rotary-position-embeddings \
--position-embedding-type rope \
--untie-embeddings-and-output-weights
--untie-embeddings-and-output-weights \
--rotary-base 1000000 \
--rotary-scale-factor 1 \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/finetune_megatron_llama_withGA.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
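
The --rotary-base 1000000 value added above matches the rope_theta that Qwen1.5 checkpoints carry in their Hugging Face config.json (1,000,000, versus Megatron's default of 10,000), which is also why --rotary-scale-factor stays at 1. A quick way to confirm the value against a downloaded checkpoint (the path below is only an example, assuming the standard config.json layout):

# Assumes a locally downloaded Qwen1.5 checkpoint directory.
python -c "import json; print(json.load(open('Qwen1.5-32B/config.json'))['rope_theta'])"
# expected output: 1000000.0
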
2 changes: 1 addition & 1 deletion examples/qwen1_5/run_pretrain_mcore_qwen.sh
@@ -30,7 +30,7 @@ LR=$6
MIN_LR=$7
SEQ_LEN=$8
PAD_LEN=$9
EXTRA_VOCAB_SIZE=${10}
EXTRA_VOCAB_SIZE=${10} # 293 for models smaller than 32b, 421 for those larger
PR=${11}
TP=${12}
PP=${13}
32 changes: 22 additions & 10 deletions examples/qwen1_5/run_pretrain_megatron_qwen.sh
@@ -31,7 +31,7 @@ LR=$6
MIN_LR=$7
SEQ_LEN=$8
PAD_LEN=$9
EXTRA_VOCAB_SIZE=${10}
EXTRA_VOCAB_SIZE=${10} # 293 for models smaller than 32b, 421 for those larger
PR=${11}
TP=${12}
PP=${13}
@@ -47,54 +47,66 @@ TRAIN_TOKENS=${22}
WARMUP_TOKENS=${23}
OUTPUT_BASEPATH=${24}


gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
HIDDEN_SIZE=1024
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=2816
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 1.8B ]; then

NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
INTERMEDIATE_SIZE=5504
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 4B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=2560
NUM_ATTN_HEADS=20
INTERMEDIATE_SIZE=6912
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 7B ]; then

NUM_LAYERS=32
HIDDEN_SIZE=4096
NUM_ATTN_HEADS=32
INTERMEDIATE_SIZE=11008
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 14B ]; then

NUM_LAYERS=40
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696
MAX_POSITION_EMBEDDINGS=32768


elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392


gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
HIDDEN_SIZE=8192
NUM_ATTN_HEADS=64
INTERMEDIATE_SIZE=24576
MAX_POSITION_EMBEDDINGS=32768


fi

@@ -202,7 +214,7 @@ megatron_options=" \
--num-attention-heads ${NUM_ATTN_HEADS} \
--ffn-hidden-size ${INTERMEDIATE_SIZE} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
--max-position-embeddings ${SEQ_LEN} \
--max-padding-length ${PAD_LEN} \
--log-interval 1 \
--eval-interval 10000 \
@@ -232,7 +244,7 @@ megatron_options=" \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/pretrain_megatron_llama.py
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options}"
${megatron_options} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
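
The new comment on EXTRA_VOCAB_SIZE (293 below 32B, 421 for 32B and 72B) corresponds to the gap between the Qwen1.5 tokenizer and the padded word-embedding tables of the released checkpoints, assuming the usual sizes of 151,643 tokenizer entries versus 151,936 (0.5B-14B) and 152,064 (32B/72B) embedding rows. The arithmetic below is illustrative only; the concrete sizes are an assumption, not something stated in this diff:

TOKENIZER_VOCAB=151643      # assumed Qwen1.5 tokenizer size
SMALL_EMBED_ROWS=151936     # assumed embedding rows in the 0.5B-14B checkpoints
LARGE_EMBED_ROWS=152064     # assumed embedding rows in the 32B/72B checkpoints
echo $(( SMALL_EMBED_ROWS - TOKENIZER_VOCAB ))   # 293
echo $(( LARGE_EMBED_ROWS - TOKENIZER_VOCAB ))   # 421
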
16 changes: 14 additions & 2 deletions examples/qwen1_5/run_text_generation_megatron_qwen.sh
@@ -21,7 +21,7 @@ TP=$5
BS=$6
SEQ_LEN=$7
PAD_LEN=$8
EXTRA_VOCAB_SIZE=$9
EXTRA_VOCAB_SIZE=$9 # 293 for models smaller than 32b, 421 for those larger
PR=${10}
TOP_K=${11}
INPUT_SEQ_LEN=${12}
@@ -33,6 +33,7 @@ TEMPERATURE=${17}
# set this penalty between 1.1 and 1.5 to reduce repetition, default is 1.2
REPETITION_PENALTY=${18}

gqa_options=""
if [ $MODEL_SIZE = 0.5B ]; then

NUM_LAYERS=24
@@ -68,6 +69,17 @@ HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=13696

elif [ $MODEL_SIZE = 32B ]; then

NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27392

gqa_options=" \
--group-query-attention \
--num-query-groups 8"

elif [ $MODEL_SIZE = 72B ]; then

NUM_LAYERS=80
@@ -132,7 +144,7 @@ rapidformer_options=" \
"

run_cmd="torchrun $DISTRIBUTED_ARGS ../llama2/generate_text_megatron_llama.py
${rapidformer_options} ${load_options} ${input_options} ${pr_options}"
${rapidformer_options} ${load_options} ${input_options} ${pr_options} ${gqa_options}"

echo ${run_cmd}
eval ${run_cmd}
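
A side effect of the 32B grouped-query settings that matters most for this text-generation script: the KV cache only stores the 8 key/value groups, not all 40 attention heads. Rough per-token arithmetic with the 32B numbers above (head dimension taken as HIDDEN_SIZE / NUM_ATTN_HEADS, an assumption that holds for these configs):

HIDDEN_SIZE=5120; NUM_ATTN_HEADS=40; NUM_QUERY_GROUPS=8; NUM_LAYERS=64
HEAD_DIM=$(( HIDDEN_SIZE / NUM_ATTN_HEADS ))                      # 128
echo $(( 2 * NUM_ATTN_HEADS * HEAD_DIM * NUM_LAYERS ))            # 655360 cached values per token without GQA
echo $(( 2 * NUM_QUERY_GROUPS * HEAD_DIM * NUM_LAYERS ))          # 131072 cached values per token with 8 KV groups (5x smaller)
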
(Diffs for the remaining 3 changed files are not rendered here.)
