# finetune_flanv2_llama_7b.yaml
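# Amulet (AMLT) job spec for QLoRA finetuning of Llama 7b on the Flan v2 data.
# Assuming the amlt CLI is installed and a storage/workspace target is configured,
# a run would typically be submitted with something like:
#   amlt run finetune_flanv2_llama_7b.yaml <experiment_name>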
description: Finetune Llama 7b on flan v2

target:
  service: sing
  name: gcrprojvc1

# option to add key for apt: curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
# install fairseq requirements into the docker image
# the image uses an older version of pytorch because the newer python version has a threading error
environment:
  image: nikghosh09/transformers:qlora_v1
  registry: docker.io # any public registry can be specified here

code:
  # local directory of the code; this will be uploaded to the server.
  # $CONFIG_DIR is expanded to the directory of this config file
  local_dir: $CONFIG_DIR
jobs:
  - name: flan_v2_llama_7b
    sku: G1
    command:
      - python open_instruct/prepare_subsampled_model.py
        --pretrained_model_name_or_path /mnt/default/pretrained/llama-7b
        --launcher accelerate
        --tokenizer_name huggyllama/llama-7b
        --use_fast_tokenizer False
        --output_dir $$AMLT_OUTPUT_DIR/checkpoints
        --logging_dir $$AMLT_OUTPUT_DIR/runs
        --subsamp_ratio 1.0
        --do_train
        --do_eval
        --report_to tensorboard
        --logging_strategy steps
        --logging_steps 10
        --evaluation_strategy steps
        --eval_steps 100
        --num_training_eval_samples 1000
        --num_train_epochs 2
        --save_strategy steps
        --save_steps 100
        --save_total_limit 2
        --eval_dataset_size 1000
        --per_device_train_batch_size 4
        --gradient_accumulation_steps 4
        --per_device_eval_batch_size 16
        --max_seq_length 512
        --preprocessing_num_workers 16
        --max_memory_MB 32000
        --dataloader_num_workers 3
        --use_lora
        --lora_rank 64
        --lora_alpha 16
        --lora_dropout 0.05
        --double_quant
        --quant_type nf4
        --bits 4
        --lr_scheduler_type constant
        --gradient_checkpointing
        --train_file /mnt/default/data/flan_v2_data.jsonl
        --ddp_find_unused_parameters False
        --learning_rate 0.0002
        --adam_beta2 0.999
        --max_grad_norm 0.3
        --weight_decay 0.0
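  # Effective train batch size for this job (assuming sku G1 = 1 GPU):
  # per_device_train_batch_size (4) x gradient_accumulation_steps (4) x 1 GPU
  # = 16 sequences per optimizer step.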
  - name: flan_v2_llama_7b_multi
    sku: G8
    command:
      - python open_instruct/prepare_subsampled_model.py
        --pretrained_model_name_or_path /mnt/default/pretrained/llama-7b
        --tokenizer_name huggyllama/llama-7b
        --use_fast_tokenizer False
        --output_dir $$AMLT_OUTPUT_DIR/checkpoints
        --logging_dir $$AMLT_OUTPUT_DIR/runs
        --subsamp_ratio 1.0
        --do_train
        --do_eval
        --report_to tensorboard
        --logging_strategy steps
        --logging_steps 10
        --evaluation_strategy steps
        --eval_steps 100
        --num_training_eval_samples 1000
        --num_train_epochs 2
        --save_strategy steps
        --save_steps 100
        --save_total_limit 2
        --eval_dataset_size 1000
        --per_device_train_batch_size 2
        --gradient_accumulation_steps 1
        --per_device_eval_batch_size 16
        --max_seq_length 512
        --preprocessing_num_workers 16
        --max_memory_MB 32000
        --dataloader_num_workers 3
        --use_lora
        --lora_rank 64
        --lora_alpha 16
        --lora_dropout 0.05
        --double_quant
        --quant_type nf4
        --bits 4
        --lr_scheduler_type constant
        --gradient_checkpointing
        --train_file /mnt/default/data/flan_v2_data.jsonl
        --ddp_find_unused_parameters False
        --learning_rate 0.0002
        --adam_beta2 0.999
        --max_grad_norm 0.3
        --weight_decay 0.0
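  # Effective train batch size for this job (assuming sku G8 = 8 GPUs):
  # per_device_train_batch_size (2) x gradient_accumulation_steps (1) x 8 GPUs
  # = 16 sequences per optimizer step, matching the single-GPU job above.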