forked from udacity/CVND---Image-Captioning-Project
model.py
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np


class EncoderCNN(nn.Module):
    """Pretrained ResNet-50 backbone that maps images to fixed-size feature embeddings."""

    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        # pretrained=True is the older torchvision API; newer releases use the weights= argument.
        resnet = models.resnet50(pretrained=True)
        # Freeze the CNN weights; only the final embedding layer is trained.
        for param in resnet.parameters():
            param.requires_grad_(False)
        # Drop the final classification layer and keep the convolutional trunk.
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.embed(features)
        return features
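
    # Minimal usage sketch (illustrative values only, not part of the original project code):
    # with 224x224 RGB inputs and embed_size=256 the encoder yields one 256-dim vector per image.
    #
    #   encoder = EncoderCNN(embed_size=256)
    #   images = torch.randn(4, 3, 224, 224)   # (batch, channels, height, width)
    #   features = encoder(images)             # -> shape (4, 256)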


class DecoderRNN(nn.Module):
    """LSTM decoder that generates caption token scores from image features."""

    def __init__(self, embed_size, hidden_size, vocab_size, feature_shape, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.dict_size = vocab_size
        self.lstm_layers = num_layers
        self.dropout_prob = 0.5
        self.hidden_dim = hidden_size
        self.embedding_dim = embed_size  # matches the output feature size of the CNN encoder
        self.feature_embeddings = nn.Embedding(self.dict_size, self.embedding_dim)
        # Note: nn.LSTM only applies dropout between stacked layers, so it has no
        # effect (and emits a warning) when num_layers == 1.
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim,
                            dropout=self.dropout_prob, num_layers=num_layers,
                            batch_first=True)
        self.dropout = nn.Dropout(self.dropout_prob)
        self.linear = nn.Linear(self.hidden_dim, self.dict_size)
        # Expected training batch size (forward() derives the actual batch size from its input).
        self.batch_dim = feature_shape
        self.init_weights_linear_layer()

    def init_weights_linear_layer(self):
        ''' Initialize weights for the embedding and fully connected layers '''
        self.feature_embeddings.weight.data.uniform_(-0.1, 0.1)
        # Set the bias tensor to all zeros
        self.linear.bias.data.fill_(0)
        # FC weights as random uniform
        self.linear.weight.data.uniform_(-1, 1)
    def init_hidden(self, batch_dim):
        # The axes dimensions are (n_layers, batch_size, hidden_dim).
        # The tensors are created on the same device as the model parameters,
        # so this works on both CPU and GPU.
        weight = next(self.parameters())
        return (weight.new_zeros(self.lstm_layers, batch_dim, self.hidden_dim),
                weight.new_zeros(self.lstm_layers, batch_dim, self.hidden_dim))
    def forward(self, features, captions):
        # Drop the last caption token so the sequence length still matches after
        # the image feature is prepended as the first "word".
        captions = captions[:, :-1]
        embeddings = self.feature_embeddings(captions)
        batch_size = features.size(0)  # use the actual batch size (the last batch may be smaller)
        hc = self.init_hidden(batch_size)
        # Prepend the image feature to the caption embeddings along the time axis.
        feature_embed = torch.cat((features.unsqueeze(dim=1), embeddings), dim=1)
        lstm_out, (h, c) = self.lstm(feature_embed, hc)
        linear_out_drop = self.dropout(lstm_out)
        linear_out_drop = self.linear(linear_out_drop.reshape(-1, self.hidden_dim))
        # Reshape back to (batch, seq_len, vocab_size).
        x = linear_out_drop.reshape(batch_size, -1, self.dict_size)
        return x
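
    # Shape walk-through for forward (illustrative numbers only, assuming
    # batch_size=10, caption length=12, embed_size=256, hidden_size=512):
    #   features: (10, 256)  -> unsqueeze(1)    -> (10, 1, 256)
    #   captions: (10, 12)   -> drop last token -> (10, 11) -> embedding -> (10, 11, 256)
    #   concatenated LSTM input: (10, 12, 256)  -> LSTM -> (10, 12, 512)
    #   dropout + linear: (10, 12, 512) -> (120, 512) -> (120, vocab) -> reshape -> (10, 12, vocab)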
    def sample(self, inputs, states=None, max_len=20):
        """
        Accepts a pre-processed image tensor (inputs) and returns a predicted
        sentence (list of token ids of length max_len).
        Decoding strategy: at every step the top-2 words are kept and one of them
        is sampled with probability proportional to its softmax score (a stochastic
        top-k sampler rather than a true beam search).
        """
        sent_idx = []
        hc = states
        for idx in range(max_len):
            lstm_out, hc = self.lstm(inputs, hc)
            linear_out = self.linear(lstm_out.reshape(-1, self.hidden_dim))
            # Convert the raw scores to probabilities before taking the top-2 candidates,
            # so that np.random.choice receives valid (non-negative, normalized) weights.
            probs = F.softmax(linear_out, dim=1)
            pred, top_word_idx = torch.topk(input=probs, k=2)
            pred = pred.cpu().detach().numpy().squeeze(0)
            top_word_idx = top_word_idx.cpu().detach().numpy()
            # Sample one of the two candidate word ids, weighted by their probabilities.
            top_word_idx = np.random.choice(top_word_idx.squeeze(0), p=pred / pred.sum())
            top_word_idx = torch.tensor([top_word_idx], dtype=torch.long, device=inputs.device)
            # Feed the chosen word back in as the input for the next time step.
            inputs = self.feature_embeddings(top_word_idx)
            inputs = inputs.unsqueeze(dim=1)
            sent_idx.append(int(top_word_idx.squeeze().cpu()))
        return sent_idx
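

if __name__ == "__main__":
    # Quick shape check for the decoder (a sketch with assumed hyperparameters,
    # not part of the original project code): feed random features and random
    # caption ids through DecoderRNN.forward and verify the output shape.
    embed_size, hidden_size, vocab_size, batch_size, seq_len = 256, 512, 1000, 4, 12
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size, feature_shape=batch_size)
    features = torch.randn(batch_size, embed_size)
    captions = torch.randint(0, vocab_size, (batch_size, seq_len))
    out = decoder(features, captions)
    print(out.shape)  # expected: torch.Size([4, 12, 1000])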