train_llm.py
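# QLoRA fine-tuning of microsoft/Phi-3-mini-4k-instruct with a LoRA adapter,
# trained on a local instruction/response JSONL dataset via TRL's SFTTrainer.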
import os
from datasets import load_dataset
from peft import LoraConfig
import torch
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
############# Training Parameters ############################
local_rank = -1
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
max_grad_norm = 0.3
weight_decay = 0.001
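# LoRA adapter hyperparameters (consumed by LoraConfig below)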
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
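# None lets SFTTrainer pick its default; older trl releases warn and cap the length at 1024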
max_seq_length = None
learning_rate = 2e-4
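# BitsAndBytes 4-bit (QLoRA) quantization settings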
use_4bit = True
use_nested_quant = False
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
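# Training run, optimizer, and logging settings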
num_train_epochs = 5
fp16 = False
bf16 = False
packing = False
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 5000
logging_steps = 10
output_dir = "./results"
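# Map the whole model onto GPU 0 (single-GPU layout)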
device_map = {"": 0}
report_to = "tensorboard"
tb_log_dir = "./results/logs"
# model_name = "microsoft/phi-2"
model_name = "microsoft/Phi-3-mini-4k-instruct"
model_adapter = "adapter"
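# Local JSONL file with "instruction" and "response" fields (consumed by format_dolly below)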
dataset_name = "/home/kalyan/Documents/LLM/prompt.jsonl"

def load_model(model_name):
    # Load tokenizer and model with QLoRA configuration
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    if compute_dtype == torch.float16 and use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=bnb_config,
        trust_remote_code=True,
    )
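    # Disable the KV cache for training; pretraining_tp=1 skips the tensor-parallel
    # code path (a Llama-era config flag, harmless for other architectures).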
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    # LoRA adapter configuration
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        # Phi-3 uses fused projections (qkv_proj, gate_up_proj); the unfused names
        # keep the list compatible with phi-2 / Llama-style checkpoints as well.
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "qkv_proj",
                        "fc1", "fc2",
                        "gate_proj", "up_proj", "down_proj", "gate_up_proj"],
        # target_modules=["q_proj", "v_proj"],
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer, peft_config

model, tokenizer, peft_config = load_model(model_name)
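
# Note: the [INST] ... [/INST] markers below follow the Llama/Mistral instruction format;
# Phi-3-instruct also ships its own chat template (tokenizer.apply_chat_template),
# which may match the base model's expected format more closely.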
def format_dolly(sample):
    instruction = f"<s>[INST] {sample['instruction']}"
    # context = f"Here is some context: {sample['context']}" if len(sample["context"]) > 0 else None
    response = f" [/INST] {sample['response']}"
    # join all the parts together
    prompt = "".join([i for i in [instruction, response] if i is not None])
    # prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample

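# Load the local JSONL dataset and map every sample to a single "text" field
# (the original columns are dropped).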
dataset = load_dataset("json", data_files=dataset_name, split="train")
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
print(dataset)
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    logging_dir=tb_log_dir,  # write TensorBoard logs to the directory configured above
    report_to=report_to,
)

# Set supervised fine-tuning parameters
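# Note: passing dataset_text_field, max_seq_length, packing, and tokenizer directly to
# SFTTrainer matches older trl releases; newer trl versions move these into an SFTConfig.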
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save only the trained LoRA adapter weights (the quantized base model is left untouched)
trainer.model.save_pretrained(model_adapter)
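
# Optional sketch (assumes the adapter directory written above): reload the base model
# and attach the saved LoRA weights with peft's PeftModel for a quick generation check.
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, trust_remote_code=True)
# finetuned = PeftModel.from_pretrained(base, model_adapter)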