forked from anwesh44/gpt-neo-fine-tuning-example
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpt_neo.py
51 lines (44 loc) · 2.85 KB
/
gpt_neo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, IntervalStrategy
torch.manual_seed(42)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", bos_token='<|startoftext|>',
eos_token='<|endoftext|>', pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").cuda()
model.resize_token_embeddings(len(tokenizer))
descriptions = pd.read_csv('netflix_titles.csv')['description']
max_length = max([len(tokenizer.encode(description)) for description in descriptions])
print("Max length: {}".format(max_length))
class NetflixDataset(Dataset):
def __init__(self, txt_list, tokenizer, max_length):
self.input_ids = []
self.attn_masks = []
self.labels = []
for txt in txt_list:
encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
max_length=max_length, padding="max_length")
self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
def __len__(self):
return len(self.input_ids)
def __getitem__(self, idx):
return self.input_ids[idx], self.attn_masks[idx]
dataset = NetflixDataset(descriptions, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
training_args = TrainingArguments(output_dir='./results', num_train_epochs=5, logging_steps=5000,
save_strategy=IntervalStrategy.NO,
per_device_train_batch_size=2, per_device_eval_batch_size=2,
warmup_steps=100, weight_decay=0.01, logging_dir='./logs')
Trainer(model=model, args=training_args, train_dataset=train_dataset,
eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
'attention_mask': torch.stack([f[1] for f in data]),
'labels': torch.stack([f[0] for f in data])}).train()
generated = tokenizer("<|startoftext|>", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
bos_token='<|startoftext|>',
eos_token='<|endoftext|>', pad_token='<|pad|>',
max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))