From 09a52ab1ddf4285d2dc6953efaa149e6c934b7d3 Mon Sep 17 00:00:00 2001
From: yizhenjia
Date: Fri, 25 Oct 2024 09:10:10 +0800
Subject: [PATCH] [usability] debug tools dev

---
 src/lmflow/pipeline/finetuner.py      | 15 +++++++++++----
 src/lmflow/utils/debug/constants.py   | 12 ++++++++++++
 src/lmflow/utils/{ => debug}/debug.py |  0
 3 files changed, 23 insertions(+), 4 deletions(-)
 create mode 100644 src/lmflow/utils/debug/constants.py
 rename src/lmflow/utils/{ => debug}/debug.py (100%)

diff --git a/src/lmflow/pipeline/finetuner.py b/src/lmflow/pipeline/finetuner.py
index 6daa7259f..e786acf7e 100644
--- a/src/lmflow/pipeline/finetuner.py
+++ b/src/lmflow/pipeline/finetuner.py
@@ -38,7 +38,7 @@
 from lmflow.datasets.dataset import Dataset
 from lmflow.pipeline.base_tuner import BaseTuner
 from lmflow.pipeline.utils.peft_trainer import PeftTrainer, PeftSavingCallback
-from lmflow.utils.debug import get_parameter_names_in_param_groups
+from lmflow.utils.debug.debug import get_parameter_names_in_param_groups
 
 logger = logging.getLogger(__name__)
 
@@ -550,7 +550,10 @@ def on_step_begin(self, args, state, control, **kwargs):
             if state.global_step % self.interval_steps == 0:
                 self.switch_active_layers()
 
-            # layers = eval('self.' + self.layers_attribute) # Re-fetch layer references
+            layers = eval('self.' + self.layers_attribute) # Re-fetch layer references
+            print(f'>>> on step {state.global_step} begin model params')
+            print(layers[self.active_layers_indices[0]].attn.c_attn.weight)
+            print(f'<<< on step {state.global_step} begin model params')
             # self.previous_params = {
             #     name: param.clone().detach()
             #     for name, param in layers[self.active_layers_indices[0]].named_parameters()
@@ -563,6 +566,7 @@ def switch_active_layers(self):
             # Randomly select n_layers to activate
             layers = eval('self.' + self.layers_attribute) # Re-fetch layer references
             self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False)
+            self.active_layers_indices.sort()
             print(f"Activating layers at indices: {self.active_layers_indices} for the next steps.", flush=True)
 
             # Enable gradients only for the selected layers
@@ -571,13 +575,16 @@ def switch_active_layers(self):
                     param.requires_grad = True
 
         def on_step_end(self, args, state, control, **kwargs):
-            # layers = eval('self.' + self.layers_attribute) # Re-fetch layer references
+            layers = eval('self.' + self.layers_attribute) # Re-fetch layer references
             # for name, param in layers[self.active_layers_indices[0]].named_parameters():
             #     if torch.equal(param, self.previous_params[name]):
             #         print(f"No change in parameter: {name}")
             #     else:
             #         print(f"Parameter updated: {name}")
-            pass
+            print(f'>>> on step {state.global_step-1} end model params')
+            print(layers[self.active_layers_indices[0]].attn.c_attn.weight.shape)
+            print(layers[self.active_layers_indices[0]].attn.c_attn.weight)
+            print(f'<<< on step {state.global_step-1} end model params')
 
         def on_optimizer_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
             pass
diff --git a/src/lmflow/utils/debug/constants.py b/src/lmflow/utils/debug/constants.py
new file mode 100644
index 000000000..21280fb6b
--- /dev/null
+++ b/src/lmflow/utils/debug/constants.py
@@ -0,0 +1,12 @@
+GPT2 = {
+    "param_name_in_group": [
+        {'parameter_names': ['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_proj.weight', 'transformer.h.2.attn.c_attn.weight', 'transformer.h.2.attn.c_proj.weight', 'transformer.h.2.mlp.c_fc.weight', 'transformer.h.2.mlp.c_proj.weight', 'transformer.h.3.attn.c_attn.weight', 'transformer.h.3.attn.c_proj.weight', 'transformer.h.3.mlp.c_fc.weight', 'transformer.h.3.mlp.c_proj.weight', 'transformer.h.4.attn.c_attn.weight', 'transformer.h.4.attn.c_proj.weight', 'transformer.h.4.mlp.c_fc.weight', 'transformer.h.4.mlp.c_proj.weight', 'transformer.h.5.attn.c_attn.weight', 'transformer.h.5.attn.c_proj.weight', 'transformer.h.5.mlp.c_fc.weight', 'transformer.h.5.mlp.c_proj.weight', 'transformer.h.6.attn.c_attn.weight', 'transformer.h.6.attn.c_proj.weight', 'transformer.h.6.mlp.c_fc.weight', 'transformer.h.6.mlp.c_proj.weight', 'transformer.h.7.attn.c_attn.weight', 'transformer.h.7.attn.c_proj.weight', 'transformer.h.7.mlp.c_fc.weight', 'transformer.h.7.mlp.c_proj.weight', 'transformer.h.8.attn.c_attn.weight', 'transformer.h.8.attn.c_proj.weight', 'transformer.h.8.mlp.c_fc.weight', 'transformer.h.8.mlp.c_proj.weight', 'transformer.h.9.attn.c_attn.weight', 'transformer.h.9.attn.c_proj.weight', 'transformer.h.9.mlp.c_fc.weight', 'transformer.h.9.mlp.c_proj.weight', 'transformer.h.10.attn.c_attn.weight', 'transformer.h.10.attn.c_proj.weight', 'transformer.h.10.mlp.c_fc.weight', 'transformer.h.10.mlp.c_proj.weight', 'transformer.h.11.attn.c_attn.weight', 'transformer.h.11.attn.c_proj.weight', 'transformer.h.11.mlp.c_fc.weight', 'transformer.h.11.mlp.c_proj.weight']},
+        {'parameter_names': ['transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.bias', 'transformer.h.1.mlp.c_proj.bias', 'transformer.h.2.ln_1.weight', 'transformer.h.2.ln_1.bias', 'transformer.h.2.attn.c_attn.bias', 'transformer.h.2.attn.c_proj.bias', 'transformer.h.2.ln_2.weight', 'transformer.h.2.ln_2.bias', 'transformer.h.2.mlp.c_fc.bias', 'transformer.h.2.mlp.c_proj.bias', 'transformer.h.3.ln_1.weight', 'transformer.h.3.ln_1.bias', 'transformer.h.3.attn.c_attn.bias', 'transformer.h.3.attn.c_proj.bias', 'transformer.h.3.ln_2.weight', 'transformer.h.3.ln_2.bias', 'transformer.h.3.mlp.c_fc.bias', 'transformer.h.3.mlp.c_proj.bias', 'transformer.h.4.ln_1.weight', 'transformer.h.4.ln_1.bias', 'transformer.h.4.attn.c_attn.bias', 'transformer.h.4.attn.c_proj.bias', 'transformer.h.4.ln_2.weight', 'transformer.h.4.ln_2.bias', 'transformer.h.4.mlp.c_fc.bias', 'transformer.h.4.mlp.c_proj.bias', 'transformer.h.5.ln_1.weight', 'transformer.h.5.ln_1.bias', 'transformer.h.5.attn.c_attn.bias', 'transformer.h.5.attn.c_proj.bias', 'transformer.h.5.ln_2.weight', 'transformer.h.5.ln_2.bias', 'transformer.h.5.mlp.c_fc.bias', 'transformer.h.5.mlp.c_proj.bias', 'transformer.h.6.ln_1.weight', 'transformer.h.6.ln_1.bias', 'transformer.h.6.attn.c_attn.bias', 'transformer.h.6.attn.c_proj.bias', 'transformer.h.6.ln_2.weight', 'transformer.h.6.ln_2.bias', 'transformer.h.6.mlp.c_fc.bias', 'transformer.h.6.mlp.c_proj.bias', 'transformer.h.7.ln_1.weight', 'transformer.h.7.ln_1.bias', 'transformer.h.7.attn.c_attn.bias', 'transformer.h.7.attn.c_proj.bias', 'transformer.h.7.ln_2.weight', 'transformer.h.7.ln_2.bias', 'transformer.h.7.mlp.c_fc.bias', 'transformer.h.7.mlp.c_proj.bias', 'transformer.h.8.ln_1.weight', 'transformer.h.8.ln_1.bias', 'transformer.h.8.attn.c_attn.bias', 'transformer.h.8.attn.c_proj.bias', 'transformer.h.8.ln_2.weight', 'transformer.h.8.ln_2.bias', 'transformer.h.8.mlp.c_fc.bias', 'transformer.h.8.mlp.c_proj.bias', 'transformer.h.9.ln_1.weight', 'transformer.h.9.ln_1.bias', 'transformer.h.9.attn.c_attn.bias', 'transformer.h.9.attn.c_proj.bias', 'transformer.h.9.ln_2.weight', 'transformer.h.9.ln_2.bias', 'transformer.h.9.mlp.c_fc.bias', 'transformer.h.9.mlp.c_proj.bias', 'transformer.h.10.ln_1.weight', 'transformer.h.10.ln_1.bias', 'transformer.h.10.attn.c_attn.bias', 'transformer.h.10.attn.c_proj.bias', 'transformer.h.10.ln_2.weight', 'transformer.h.10.ln_2.bias', 'transformer.h.10.mlp.c_fc.bias', 'transformer.h.10.mlp.c_proj.bias', 'transformer.h.11.ln_1.weight', 'transformer.h.11.ln_1.bias', 'transformer.h.11.attn.c_attn.bias', 'transformer.h.11.attn.c_proj.bias', 'transformer.h.11.ln_2.weight', 'transformer.h.11.ln_2.bias', 'transformer.h.11.mlp.c_fc.bias', 'transformer.h.11.mlp.c_proj.bias', 'transformer.ln_f.weight', 'transformer.ln_f.bias']}
+    ],
+    "num_params": {
+        "lm_head": 50257*768 + 1024*768,  # wte, wpe
+        "gpt2block": 768*2304 + 768*768 + 768*3072 + 768*3072 + 6*768 + 2304 + 3072,
+        "gpt2block_in_pg0": 768*2304 + 768*768 + 768*3072 + 768*3072,  # weight decay (if any)
+        "gpt2block_in_pg1": 768 + 768 + 2304 + 768 + 768 + 768 + 3072 + 768,  # no weight decay (no matter what): ln_1.weight 768, ln_1.bias 768, attn.c_attn.bias 2304, attn.c_proj.bias 768, ln_2.weight 768, ln_2.bias 768, mlp.c_fc.bias 3072, mlp.c_proj.bias 768
+    }
+}
\ No newline at end of file
diff --git a/src/lmflow/utils/debug.py b/src/lmflow/utils/debug/debug.py
similarity index 100%
rename from src/lmflow/utils/debug.py
rename to src/lmflow/utils/debug/debug.py
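
Note (not part of the patch): the commented-out previous_params code in on_step_begin/on_step_end gestures at a parameter-diff check. Below is a minimal standalone sketch of that idea as a plain Hugging Face TrainerCallback; ParamDiffCallback and its variable names are illustrative, not LMFlow APIs.

# Illustrative sketch only -- not LMFlow code. Snapshots one module's parameters before
# each optimizer step and reports which of them changed after the step.
import torch
from transformers import TrainerCallback

class ParamDiffCallback(TrainerCallback):
    def __init__(self, module):
        self.module = module  # e.g. model.transformer.h[0] for GPT-2
        self.previous_params = {}

    def on_step_begin(self, args, state, control, **kwargs):
        # Clone the watched parameters before the forward/backward/optimizer step.
        self.previous_params = {
            name: param.detach().clone()
            for name, param in self.module.named_parameters()
        }

    def on_step_end(self, args, state, control, **kwargs):
        # Compare against the snapshot; only layers with requires_grad=True should move.
        for name, param in self.module.named_parameters():
            changed = not torch.equal(param.detach(), self.previous_params[name])
            print(f"step {state.global_step - 1}: {name} {'updated' if changed else 'unchanged'}")

Registering it with trainer.add_callback(ParamDiffCallback(model.transformer.h[0])) would give the same before/after view as the prints added in this patch, without hard-coding attn.c_attn.weight.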
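
Note (not part of the patch): the num_params arithmetic and the two parameter groups encoded in constants.py can be cross-checked against the stock Hugging Face "gpt2" checkpoint. A minimal sketch, assuming the usual Trainer convention that biases and LayerNorm weights get no weight decay; the variable names below are illustrative, not LMFlow APIs.

# Illustrative sketch only -- cross-checks the GPT2 constants against Hugging Face "gpt2".
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")
no_decay_suffixes = ("bias", "ln_1.weight", "ln_2.weight", "ln_f.weight")

# Per-block totals from "num_params": one full GPT-2 block, split into the
# weight-decay group (pg0) and the no-decay group (pg1).
block = model.transformer.h[0]
pg0 = sum(p.numel() for n, p in block.named_parameters() if not n.endswith(no_decay_suffixes))
pg1 = sum(p.numel() for n, p in block.named_parameters() if n.endswith(no_decay_suffixes))
assert pg0 == 768*2304 + 768*768 + 768*3072 + 768*3072
assert pg1 == 768 + 768 + 2304 + 768 + 768 + 768 + 3072 + 768
assert pg0 + pg1 == 768*2304 + 768*768 + 768*3072 + 768*3072 + 6*768 + 2304 + 3072

# "lm_head" counts the tied embeddings: wte (50257 x 768) + wpe (1024 x 768).
assert model.transformer.wte.weight.numel() + model.transformer.wpe.weight.numel() == 50257*768 + 1024*768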