fine-tuner.py
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import os
# Load the dataset
data = pd.read_csv("merged.csv")
leetcode_problems = data["Question Text"]
leetcode_solutions = data["Solution_Path"]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the model and tokenizer
modelpath = "./model-finetuned" if os.path.exists("./model-finetuned") else "gpt2"
tokenizerpath = "./tokenizer-finetuned" if os.path.exists("./tokenizer-finetuned") else "gpt2"
model = GPT2LMHeadModel.from_pretrained(modelpath)
tokenizer = GPT2Tokenizer.from_pretrained(tokenizerpath)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))  # account for the new [PAD] token
model.to(device)  # move the model to the GPU when one is available
print(f"Model loaded: {modelpath}")
# Dataset definition
class LeetcodeDataset(Dataset):
    def __init__(self, leetcode_problems, leetcode_solutions, tokenizer, max_len):
        self.leetcode_problems = leetcode_problems
        self.leetcode_solutions = leetcode_solutions
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.leetcode_problems)
    def __getitem__(self, idx):
        problem = self.leetcode_problems.iloc[idx]
        solution_path = self.leetcode_solutions.iloc[idx]
        with open(solution_path, 'r') as file:
            solution_code = file.read()
        # For causal LM fine-tuning, the prompt and the target solution are
        # concatenated into a single sequence so GPT-2 learns to continue the
        # problem statement with its solution.
        text = f"Problem: {problem} Solution: {solution_code}"
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        # Labels mirror the inputs; padding positions are set to -100 so the
        # loss ignores them.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
# Dataloader
dataset = LeetcodeDataset(leetcode_problems, leetcode_solutions, tokenizer, max_len=128)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
# Training loop
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Backward pass
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss / len(dataloader)}")

model.save_pretrained("./model-finetuned")
tokenizer.save_pretrained("./tokenizer-finetuned")
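
# A minimal sanity-check sketch after training (an addition, assuming the standard
# transformers generate() API; the sample problem string below is a hypothetical
# example, not taken from the dataset). It prints one greedy completion from the
# fine-tuned model so the prompt format used during training can be spot-checked.
model.eval()
sample_problem = "Problem: Given an array of integers, return indices of the two numbers that add up to a target. Solution:"
sample_inputs = tokenizer(sample_problem, return_tensors="pt").to(device)
with torch.no_grad():
    generated = model.generate(
        **sample_inputs,
        max_new_tokens=128,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(generated[0], skip_special_tokens=True))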