generate_model.py
import numpy as np
import tensorflow as tf
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model, Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
EMBEDDING_DIM = 256
lstm_layers = 2
dropout_rate = 0.2
learning_rate = 0.001
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        all_desc.extend(descriptions[key])
    return all_desc
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
# calculate the length of the description with the most words
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo):
    vocab_size = len(tokenizer.word_index) + 1
    X1, X2, y = [], [], []
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length, n_step=1):
    # loop forever over images
    while True:
        # loop over photo identifiers in the dataset
        keys = list(descriptions.keys())
        for i in range(0, len(keys), n_step):
            Ximages, XSeq, y = list(), list(), list()
            for j in range(i, min(len(keys), i + n_step)):
                image_id = keys[j]
                # retrieve the photo feature
                photo = photos[image_id][0]
                desc_list = descriptions[image_id]
                in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
                for k in range(len(in_img)):
                    Ximages.append(in_img[k])
                    XSeq.append(in_seq[k])
                    y.append(out_word[k])
            # yield an (inputs, targets) tuple so Keras can unpack the batch correctly
            yield ([np.array(Ximages), np.array(XSeq)], np.array(y))
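
# Illustrative note (not part of the original script): pulling one batch from the
# generator above shows the structure Keras expects from it. The shapes below
# assume 4096-d photo features and are given for orientation only.
#   inputs, targets = next(data_generator(descriptions, photos, tokenizer, max_length))
#   inputs[0].shape  -> (num_pairs, 4096)        # photo feature repeated per word pair
#   inputs[1].shape  -> (num_pairs, max_length)  # padded input word sequences
#   targets.shape    -> (num_pairs, vocab_size)  # one-hot next-word targets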
def categorical_crossentropy_from_logits(y_true, y_pred):
    y_true = y_true[:, :-1, :]  # discard the last timestep
    y_pred = y_pred[:, :-1, :]  # discard the last timestep
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_true,
                                                   logits=y_pred)
    return loss
def categorical_accuracy_with_variable_timestep(y_true, y_pred):
    y_true = y_true[:, :-1, :]  # discard the last timestep
    y_pred = y_pred[:, :-1, :]  # discard the last timestep
    # flatten the timestep dimension
    shape = tf.shape(y_true)
    y_true = tf.reshape(y_true, [-1, shape[-1]])
    y_pred = tf.reshape(y_pred, [-1, shape[-1]])
    # discard rows that are all zeros as they represent padding words
    is_zero_y_true = tf.equal(y_true, 0)
    is_zero_row_y_true = tf.reduce_all(is_zero_y_true, axis=-1)
    y_true = tf.boolean_mask(y_true, ~is_zero_row_y_true)
    y_pred = tf.boolean_mask(y_pred, ~is_zero_row_y_true)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_true, axis=1),
                                               tf.argmax(y_pred, axis=1)),
                                      dtype=tf.float32))
    return accuracy
# define the captioning model
def define_model(vocab_size, max_length):
    # feature extractor (encoder)
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(EMBEDDING_DIM, activation='relu')(fe1)
    fe3 = RepeatVector(max_length)(fe2)
    # embedding
    inputs2 = Input(shape=(max_length,))
    emb2 = Embedding(vocab_size, EMBEDDING_DIM, mask_zero=True)(inputs2)
    # merge inputs
    merged = concatenate([fe3, emb2])
    # language model (decoder)
    lm2 = LSTM(500, return_sequences=False)(merged)
    # lm3 = Dense(500, activation='relu')(lm2)
    outputs = Dense(vocab_size, activation='softmax')(lm2)
    # tie it together [image, seq] -> [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    plot_model(model, show_shapes=True, to_file='model.png')
    return model
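
# Example wiring (a minimal sketch, not part of the original script). It assumes
# 'descriptions.pkl' (image_id -> list of cleaned captions) and 'features.pkl'
# (image_id -> 4096-d photo features) already exist; both file names are
# placeholders, as are the epoch count and checkpoint pattern.
if __name__ == '__main__':
    descriptions = load(open('descriptions.pkl', 'rb'))  # hypothetical path
    features = load(open('features.pkl', 'rb'))          # hypothetical path

    tokenizer = create_tokenizer(descriptions)
    vocab_size = len(tokenizer.word_index) + 1
    max_len = max_length(descriptions)

    model = define_model(vocab_size, max_len)
    generator = data_generator(descriptions, features, tokenizer, max_len)
    checkpoint = ModelCheckpoint('model-{epoch:02d}.h5', verbose=1)
    # one image (n_step=1) per generator step, so one epoch covers every image once
    model.fit_generator(generator, steps_per_epoch=len(descriptions),
                        epochs=20, callbacks=[checkpoint], verbose=1)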