Fix convergence problem caused by deepspeed
- Also add different settings of deepspeed for single-gpu and multi-gpu
research4pan committed Mar 29, 2024
1 parent 716459c commit aabd071
Showing 2 changed files with 11 additions and 4 deletions.
10 changes: 9 additions & 1 deletion scripts/run_finetune_with_lisa.sh
@@ -51,6 +51,14 @@ project_dir=$(cd "$(dirname $0)"/..; pwd)
 log_dir=${project_dir}/log/${exp_id}
 mkdir -p ${output_dir} ${log_dir}
 
+# Enable model parallelism for multiple gpus, modify this if you prefer
+# customized deepspeed zero-redundancy optimization settings
+num_gpu=$(python -c "import torch; print(torch.cuda.device_count())")
+ds_config_file=configs/ds_config_zero0_no_offload.json
+if [ ${num_gpu} -ge 2 ]; then
+  ds_config_file=configs/ds_config_zero2_no_offload.json
+fi
+
 deepspeed ${deepspeed_args} \
   examples/finetune.py \
     --model_name_or_path ${model_name_or_path} \
@@ -60,7 +68,7 @@ deepspeed ${deepspeed_args} \
     --learning_rate 2e-5 \
     --block_size 512 \
     --per_device_train_batch_size 1 \
-    --deepspeed configs/ds_config_zero3_no_offload.json \
+    --deepspeed ${ds_config_file} \
     --fp16 \
     --run_name finetune \
     --validation_split_percentage 0 \
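
Net effect of the script change: ZeRO stage 0 (no state partitioning) is used on a single GPU and ZeRO stage 2 (optimizer-state and gradient partitioning, no offload) on two or more GPUs, replacing the previous stage-3 config, whose parameter partitioning presumably interacted badly with LISA's dynamic layer freezing. The two JSON files are not part of this diff; the sketch below is a hypothetical Python rendering of the same selection, assuming standard DeepSpeed config keys and the HuggingFace Trainer's "auto" placeholders, so the actual files under configs/ may differ.

    # Hypothetical sketch: mirrors the shell logic above in Python and writes a
    # minimal DeepSpeed config. The real files under configs/ are not shown in
    # this diff and may contain additional keys.
    import json
    import torch

    def build_ds_config(num_gpu: int) -> dict:
        # ZeRO stage 0 (plain data parallelism) for a single GPU; stage 2
        # (optimizer-state and gradient partitioning, no offload) for 2+ GPUs.
        stage = 2 if num_gpu >= 2 else 0
        return {
            "fp16": {"enabled": "auto"},
            "zero_optimization": {"stage": stage},
            "train_micro_batch_size_per_gpu": "auto",
            "gradient_accumulation_steps": "auto",
        }

    if __name__ == "__main__":
        num_gpu = torch.cuda.device_count()
        with open("ds_config_generated.json", "w") as f:  # illustrative filename
            json.dump(build_ds_config(num_gpu), f, indent=2)
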
5 changes: 2 additions & 3 deletions src/lmflow/pipeline/finetuner.py
@@ -311,8 +311,7 @@ def __init__(self, n_layers, interval_steps, model):
         self.layers_attribute = 'model.transformer.h' # General access path
         self.total_layers = len(eval('self.' + self.layers_attribute)) # Dynamically execute to get the number of layers
 
-        # Freeze all layers upon initialization
-        self.freeze_all_layers()
+        self.switch_active_layers()
         self.active_layers_indices = []
 
     def freeze_all_layers(self):
@@ -323,7 +322,7 @@ def freeze_all_layers(self):
 
     def on_step_begin(self, args, state, control, **kwargs):
         # Check if it's time to switch active layers, including at step 0
-        if state.global_step % self.interval_steps == 0 or state.global_step == 1:
+        if state.global_step % self.interval_steps == 0:
             self.switch_active_layers()
 
     def switch_active_layers(self):
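
For context, the two hunks above adjust the dynamic layer-activation callback used for LISA finetuning: the callback now samples its first set of active layers once in __init__ rather than only freezing everything, and the extra `state.global_step == 1` trigger is dropped so layers are re-sampled only on interval boundaries. A minimal self-contained sketch of that pattern follows; the class and argument names are illustrative rather than LMFlow's, since most of the surrounding code is not shown in the hunks.

    # Illustrative sketch only: names are hypothetical; the structure mirrors
    # the callback edited in the diff above.
    import numpy as np
    from transformers import TrainerCallback

    class LayerSwitchCallback(TrainerCallback):
        def __init__(self, layers, n_active_layers, interval_steps):
            self.layers = layers                    # e.g. model.transformer.h
            self.n_active_layers = n_active_layers  # layers trained at a time
            self.interval_steps = interval_steps    # steps between re-samples
            # Sample the first set of trainable layers at construction time,
            # instead of freezing everything and switching again at step 1.
            self.active_layers_indices = []
            self.switch_active_layers()

        def freeze_all_layers(self):
            for layer in self.layers:
                for param in layer.parameters():
                    param.requires_grad = False

        def switch_active_layers(self):
            self.freeze_all_layers()
            self.active_layers_indices = np.random.choice(
                len(self.layers), self.n_active_layers, replace=False)
            for idx in self.active_layers_indices:
                for param in self.layers[idx].parameters():
                    param.requires_grad = True

        def on_step_begin(self, args, state, control, **kwargs):
            # Re-sample active layers only on interval boundaries (step 0
            # included), matching the simplified condition in the diff.
            if state.global_step % self.interval_steps == 0:
                self.switch_active_layers()
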
