'''Training script.

Trains ResNet50 (optionally with CBAM attention) using PyTorch
DistributedDataParallel, spawning one worker process per GPU.
'''
import os

from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam, lr_scheduler
from torchsummary import summary
from torchvision import transforms
import torch.distributed as dist
import torch.multiprocessing as mp

from models.resnet50 import ResNet50
from runtime_args import args
from load_dataset import LoadDataset
from plot import plot_loss_acc
from helpers import calculate_accuracy

# device = torch.device("cuda:0" if torch.cuda.is_available() and args.device == 'gpu' else 'cpu')

if not os.path.exists(args.graphs_folder):
    os.mkdir(args.graphs_folder)

model_save_folder = 'resnet_cbam/' if args.use_cbam else 'resnet/'
if not os.path.exists(model_save_folder):
    os.mkdir(model_save_folder)
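
# mp.spawn (at the bottom of this file) runs train() once per local GPU;
# `gpu` is this process's GPU index on the node (0 .. args.gpus - 1).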
def train(gpu, args):
    '''Init models and dataloaders, then train/validate the model.
    '''
    rank = args.rank * args.gpus + gpu
    world_size = args.gpus * args.nodes
    dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
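
    # `env://` rendezvous: workers discover each other via MASTER_ADDR/MASTER_PORT,
    # which are set at module level below so every spawned process inherits them.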
    model = ResNet50(image_depth=args.img_depth, num_classes=args.num_classes, use_cbam=args.use_cbam)
    torch.cuda.set_device(gpu)
    model.cuda(gpu)

    optimizer = Adam(model.parameters(), lr=args.learning_rate)
    lr_decay = lr_scheduler.ExponentialLR(optimizer, gamma=args.decay_rate)
    criterion = torch.nn.CrossEntropyLoss().cuda(gpu)

    # Print the architecture summary from one worker only, using the configured
    # input size instead of a hard-coded (3, 224, 224).
    if rank == 0:
        summary(model, (args.img_depth, args.img_size, args.img_size))

    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
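    # DistributedDataParallel replicates the model on each worker and all-reduces
    # gradients during backward(), keeping the replicas in sync.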
    train_dataset = LoadDataset(dataset_folder_path=args.data_folder, image_size=args.img_size, image_depth=args.img_depth, train=True,
                                transform=transforms.ToTensor())
    test_dataset = LoadDataset(dataset_folder_path=args.data_folder, image_size=args.img_size, image_depth=args.img_depth, train=False,
                               transform=transforms.ToTensor())

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
    # test_sampler = torch.utils.data.distributed.DistributedSampler(test_dataset, num_replicas=world_size, rank=rank)

    train_generator = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False,
                                 num_workers=args.num_workers, pin_memory=True, sampler=train_sampler)
    test_generator = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers,
                                pin_memory=True)
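
    # DistributedSampler shards the training set so each rank sees a distinct
    # 1/world_size slice per epoch (hence shuffle=False on the DataLoader). The
    # test sampler is commented out above, so every rank evaluates the full test set.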
    training_loss_list = []
    training_acc_list = []
    testing_loss_list = []
    testing_acc_list = []

    best_accuracy = 0
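    # best_accuracy tracks the best test accuracy seen so far and gates checkpointing.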
    for epoch_idx in range(args.epoch):

        # Model training & validation.
        train_sampler.set_epoch(epoch_idx)  # reshuffle the shards differently each epoch.
        model.train()

        epoch_loss = []
        epoch_accuracy = []
        i = 0

        for i, sample in tqdm(enumerate(train_generator)):
            batch_x, batch_y = sample['image'].cuda(non_blocking=True), sample['label'].cuda(non_blocking=True)

            optimizer.zero_grad()

            _, net_output = model(batch_x)
            total_loss = criterion(input=net_output, target=batch_y)
            total_loss.backward()
            optimizer.step()

            batch_accuracy = calculate_accuracy(predicted=net_output, target=batch_y)

            epoch_loss.append(total_loss.item())
            epoch_accuracy.append(batch_accuracy)

        curr_accuracy = sum(epoch_accuracy) / (i + 1)
        curr_loss = sum(epoch_loss) / (i + 1)

        training_loss_list.append(curr_loss)
        training_acc_list.append(curr_accuracy)

        print(f"Epoch {epoch_idx}")
        print(f"Training Loss : {curr_loss}, Training accuracy : {curr_accuracy}")
        model.eval()

        epoch_loss = []
        epoch_accuracy = []
        i = 0

        with torch.no_grad():
            for i, sample in tqdm(enumerate(test_generator)):
                batch_x, batch_y = sample['image'].cuda(non_blocking=True), sample['label'].cuda(non_blocking=True)

                _, net_output = model(batch_x)
                total_loss = criterion(input=net_output, target=batch_y)

                batch_accuracy = calculate_accuracy(predicted=net_output, target=batch_y)

                epoch_loss.append(total_loss.item())
                epoch_accuracy.append(batch_accuracy)

        curr_accuracy = sum(epoch_accuracy) / (i + 1)
        curr_loss = sum(epoch_loss) / (i + 1)

        testing_loss_list.append(curr_loss)
        testing_acc_list.append(curr_accuracy)

        print(f"Testing Loss : {curr_loss}, Testing accuracy : {curr_accuracy}")
        # Plot the accuracy and loss graphs.
        plot_loss_acc(path=args.graphs_folder, num_epoch=epoch_idx, train_accuracies=training_acc_list, train_losses=training_loss_list,
                      test_accuracies=testing_acc_list, test_losses=testing_loss_list)

        if epoch_idx % 5 == 0:
            lr_decay.step()  # decay the learning rate every 5 epochs (including epoch 0).
            curr_lr = 0
            for params in optimizer.param_groups:
                curr_lr = params['lr']
            print(f"The current learning rate for training is : {curr_lr}")

        if best_accuracy < curr_accuracy:
            if rank == 0:
                # Save the unwrapped model (model.module) so the checkpoint loads
                # without the DDP 'module.' key prefix, and write it from rank 0
                # only to avoid concurrent writes to the same file.
                torch.save(model.module.state_dict(), f"{model_save_folder}model.pth")
            best_accuracy = curr_accuracy
            print('Model is saved!')

        print('\n--------------------------------------------------------------------------------\n')
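
# Rendezvous address for init_method='env://'. MASTER_ADDR must point to the
# rank-0 machine and be reachable from every node; the hard-coded IP here is
# deployment-specific and should be changed for your cluster.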
os.environ['MASTER_ADDR'] = '10.106.15.226'
os.environ['MASTER_PORT'] = '8888'
if __name__ == '__main__':
    mp.spawn(train, nprocs=args.gpus, args=(args,))
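# Example launch (a sketch only: assumes a single node with 4 GPUs and that
# runtime_args exposes --gpus/--nodes/--rank; actual flag names may differ):
#   python train.py --gpus 4 --nodes 1 --rank 0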