-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlstm.py
98 lines (85 loc) · 3.91 KB
/
lstm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import time
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Show the torch version next to the chosen backend so runs are easy to compare.
print(torch.__version__, device)
# @NOTE:
# This is an LSTM multi-class classification model (10 classes) trained on the Fashion-MNIST dataset.
# Fashion-MNIST is a dataset of 28×28 grayscale images.
# Training set has 60,000 images and the test set has 10,000 images.
# https://www.youtube.com/watch?v=J_ksCv_r_rU - explanation
# my best result: [avg test loss: 0.292882; test accuracy: 89.77%]
# @TODO:
# [x] try moving each tensor and layer to the device, then try moving only the model to the device. is there a difference?
# [x] try removing the hyperparameter attributes from the LSTM class and using the values directly from where they are originally defined. does it matter?
# [.] master shapes of reshape() function
# [.] learn how to save and load model state_dict
# [.] without a tutorial figure out how to train this model on RGB images
# [.] implement the same model in tinygrad and compare ease of use and performance (https://github.com/tinygrad/tinygrad/blob/master/docs/quickstart.md)
# [.] implement transformer in the same way and compare results (https://github.com/norflin321/ml/blob/main/transformer.py)
# load both Fashion-MNIST splits (download=True fetches the files into ./datasets);
# the same ToTensor transform is applied to every image of either split
to_tensor = transforms.ToTensor()
train_data = datasets.FashionMNIST(root="./datasets", train=True, transform=to_tensor, download=True)
test_data = datasets.FashionMNIST(root="./datasets", train=False, transform=to_tensor, download=True)

# wrap each split in a DataLoader that yields 100 images per batch
batch_size = 100
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size) for split in (train_data, test_data)
)
# define hyperparameters

# every image batch is reshaped to (batch, sequence_len, input_len) before it is fed to the model
sequence_len, input_len = 28, 28

# LSTM capacity: width of the hidden state and number of stacked LSTM layers
hidden_size, n_layers = 128, 2

# size of the output layer (one score per class)
n_classes = 10

# number of full passes over the training dataloader
n_steps = 10

# initial learning rate handed to the optimizer
lr = 0.01
class LSTM(nn.Module):
    """Sequence classifier: a stacked LSTM followed by a linear output layer.

    The layer sizes (``input_len``, ``hidden_size``, ``n_layers``, ``n_classes``)
    and the initial ``device`` are read from the module-level settings when the
    model is constructed.
    """

    def __init__(self):
        super().__init__()
        # batch_first=True -> inputs and outputs are laid out as (batch, seq, feature)
        self.lstm = nn.LSTM(input_len, hidden_size, n_layers, batch_first=True, device=device)
        self.output_layer = nn.Linear(hidden_size, n_classes, device=device)

    def forward(self, input):
        """Return class scores for a batch of sequences.

        Args:
            input: tensor of shape (batch, seq_len, input_len) located on the
                same device as the model parameters.

        Returns:
            Tensor of shape (batch, n_classes) with one raw score per class.
        """
        # No explicit (h_0, c_0): nn.LSTM starts from all-zero hidden and cell
        # states when they are omitted, and it creates them on the input's own
        # device. Building them from the global `device` string instead breaks
        # as soon as the model is moved somewhere else (e.g. model.cpu()).
        out, _ = self.lstm(input)

        # classify from the LSTM output at the last time step only
        return self.output_layer(out[:, -1, :])
# build the network, place it on the selected device and print its layers
model = LSTM().to(device)
print(model)

# cross-entropy between the predicted class scores and the integer labels
loss_fn = nn.CrossEntropyLoss()

# Adam optimizer; every scheduler.step() multiplies the learning rate by 0.9
# optimizer = optim.SGD(model.parameters(), lr=lr)
optimizer = optim.Adam(params=model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.9)
def train():
    """Train the global ``model`` for ``n_steps`` passes over ``train_dataloader``.

    Each pass runs one optimizer update per batch, then decays the learning
    rate once through ``scheduler`` and prints the average batch loss of that
    pass. The total training time in whole seconds is printed at the end.
    Nothing is returned; the model, optimizer and scheduler are updated in place.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    t_start = time.perf_counter()
    n_batches = len(train_dataloader)
    print(f"Start training for {n_steps} steps on dataloader with {n_batches} batches of {batch_size} images in each batch...")

    # Make sure the model is in training mode even if it was switched to
    # eval mode earlier (e.g. by an evaluation run).
    model.train()

    for step in range(n_steps):
        total_loss = 0
        for images, labels in train_dataloader:
            # lay each image out as a sequence: (batch, sequence_len, input_len)
            images = images.reshape(-1, sequence_len, input_len)
            preds = model(images.to(device))
            loss = loss_fn(preds, labels.to(device))
            total_loss += loss.item()

            # clear stale gradients, backpropagate, then update the weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # decay the learning rate once per pass over the data
        scheduler.step()
        print(f"-- step: {step+1}; loss: {(total_loss / n_batches):>4f}")

    print(f"trained in {int(time.perf_counter() - t_start)} sec")
# run the training loop as soon as the script is executed
train()
@torch.no_grad()
def test():
    """Evaluate the global ``model`` on ``test_dataloader`` and print the metrics.

    Prints the loss averaged over batches and the accuracy over the whole test
    dataset. Runs without gradient tracking (``torch.no_grad``) and with the
    model in eval mode; the model's previous train/eval mode is restored
    afterwards. Nothing is returned.
    """
    n_batches = len(test_dataloader)
    total_loss, total_correct = 0, 0
    print("Start evaluating test dataset...")

    # Evaluate in eval mode so mode-dependent layers behave deterministically,
    # then put the model back into whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        for images, labels in test_dataloader:
            # lay each image out as a sequence: (batch, sequence_len, input_len)
            images = images.reshape(-1, sequence_len, input_len)
            # move the labels once; they are needed for both the loss and the accuracy
            labels = labels.to(device)
            preds = model(images.to(device))
            total_loss += loss_fn(preds, labels).item()
            # a prediction is correct when the highest-scoring class equals the label
            total_correct += (preds.argmax(1) == labels).sum().item()
    finally:
        model.train(was_training)

    print(f"avg test loss: {(total_loss/n_batches):>4f}; test accuracy: {((total_correct/len(test_dataloader.dataset)) * 100):>0.2f}%")
# report loss and accuracy on the test dataloader
test()