Commit
Lora/PEFT training script CI test (NVIDIA#6664)
* new lora test

* updates

* check for chat

* update

* update

* small train set

* update

* precision change

* fixed typo in paths

* full data with limit val batches

* tp2 instead of pp2

* tp2 instead of pp2

---------

Signed-off-by: arendu <[email protected]>
Signed-off-by: Adi Renduchintala <[email protected]>
arendu authored Jun 5, 2023
1 parent 5c3ed94 commit ae00414
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions Jenkinsfile
@@ -3381,6 +3381,41 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/gpt_sft_results"
}
}
stage('L2: Megatron GPT PEFT Lora TP=2') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results"
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=16 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
model.peft.peft_scheme='lora' \
model.answer_only_loss=True \
model.micro_batch_size=1 \
model.global_batch_size=4 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]"
sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results"
}
}
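The new stage configures the run entirely through Hydra-style dotted overrides: plain `key=value` pairs (e.g. `trainer.devices=2`) override keys that already exist in the config, while a `++` prefix (e.g. `++trainer.limit_val_batches=2`) force-adds the key if it is absent. The following is a toy, self-contained sketch of that mechanic, not Hydra's actual implementation; the `apply_overrides` helper and its error behavior are illustrative assumptions.

```python
# Toy sketch (NOT Hydra's real code) of how dotted "key=value" overrides,
# like the ones passed to megatron_gpt_peft_tuning.py above, map onto a
# nested config dict. A "++" prefix force-adds a missing key; a plain
# override requires the key to already exist.
def apply_overrides(config, overrides):
    for item in overrides:
        force_add = item.startswith("++")
        key, _, value = item.lstrip("+").partition("=")
        parts = key.split(".")
        node = config
        # Walk (and, for "++", create) the intermediate sections.
        for part in parts[:-1]:
            if part not in node:
                if not force_add:
                    raise KeyError(f"unknown config section: {part}")
                node[part] = {}
            node = node[part]
        if parts[-1] not in node and not force_add:
            raise KeyError(f"unknown config key: {key}")
        node[parts[-1]] = value

# Hypothetical base config with only a couple of the trainer keys.
cfg = {"trainer": {"devices": "1", "precision": "32"}}
apply_overrides(cfg, ["trainer.devices=2", "++trainer.limit_val_batches=2"])
# cfg["trainer"] now holds devices="2" and the force-added limit_val_batches="2".
```

In the real stage, `trainer.limit_val_batches` carries the `++` prefix presumably because it is not present in the default tuning config, whereas keys like `trainer.devices` and `model.tensor_model_parallel_size` are overridden in place.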
stage('L2: Megatron GPT Eval') {
when {
anyOf {
