-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
236 lines (175 loc) · 11.1 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os

# TF_ENABLE_ONEDNN_OPTS only takes effect if it is set before TensorFlow is
# loaded, so it must come before `import tensorflow` and before the project
# modules below (which may import TensorFlow themselves — not visible here).
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

from preprare_dataset import *
from main import Transformer
import tensorflow as tf
# Padding
# What It Is: Padding involves adding extra tokens to sequences to make all sequences in a batch the same length. This is necessary because many models, including transformers, require inputs to be of uniform size.
# Why It’s Needed: Sequences in natural language (e.g., sentences) can have varying lengths. Padding ensures that the model can process batches of sequences efficiently, even if they are not of the same length.
# Example: If you have sentences of lengths 5, 7, and 10, you might pad them all to length 10. The length-5 sentence would then look like [1, 2, 3, 4, 5, 0, 0, 0, 0, 0], where 0 is the padding token.
# Masking
# What It Is: Masking is a technique used to tell the model which tokens should be considered (valid) and which should be ignored (padding tokens) during processing.
# Why It’s Needed: Padding tokens are not actual content and should not influence the model’s learning or predictions. Masking helps in focusing only on the real data by ignoring these padding tokens.
# How It’s Done:
# Padding Mask: Used to ignore padding tokens in both the input sequences and the target sequences. For example, in the loss computation, padding tokens should not contribute to the loss.
# Look-ahead Mask: Used in the decoder to ensure that each token can only attend to previous tokens and not future tokens (important for autoregressive models).
# d_model
# Embedding Dimension: In the embedding layer, d_model is the size of the dense vector used to represent each token. For instance, if d_model is 512, each token in the sequence is represented by a 512-dimensional vector.
# Hidden State Dimension: During the forward pass of the Transformer, the hidden states in the encoder and decoder are also of dimension d_model. This dimension is consistent across various layers and components of the model.
# Attention Mechanism: In the multi-head self-attention mechanism, d_model influences how the queries, keys, and values are projected. Each head in multi-head attention operates in a subspace of dimension d_model / num_heads, where num_heads is the number of attention heads.
# Feed-Forward Networks: The position-wise feed-forward networks within the Transformer also operate in this space, typically using a higher-dimensional hidden layer before projecting back to d_model.
# Positional Encoding: The positional encoding vectors are also of dimension d_model, allowing the model to incorporate the positional information into the embeddings
# Embedding
# Each token in the vocabulary is usually mapped to a vector of fixed size using an embedding layer
# This vector represents the token in a continuous space, which the model uses for further processing
# Purpose of BUFFER_SIZE
# 1) Shuffling Data:
# Shuffling: Shuffling is the process of randomly rearranging the order of elements in the dataset. This helps in ensuring that the model does not learn any unintended patterns from the order of the data.
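# Buffer Size: BUFFER_SIZE specifies how many elements are kept in memory for shuffling. For example, with BUFFER_SIZE set to 20000, TensorFlow fills a buffer with the first 20000 elements, draws each output element at random from that buffer, and refills the freed slot with the next element of the dataset. The shuffle is only fully uniform when BUFFER_SIZE is at least as large as the dataset.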
# 2) Impact on Performance:
# Large Buffer Size: A larger BUFFER_SIZE means more elements are shuffled at once, which generally improves randomness and the effectiveness of shuffling. However, it also requires more memory.
# Small Buffer Size: A smaller buffer may not shuffle as thoroughly, potentially leading to less effective shuffling. It uses less memory but may result in patterns in the data not being sufficiently randomized.
# Batching:
# Definition: Batching is the process of dividing your dataset into smaller, manageable chunks, or "batches." Each batch contains a fixed number of elements from the dataset.
# Purpose: This is essential for training machine learning models because it allows you to process multiple samples simultaneously. Instead of feeding the model one example at a time, you feed it a batch of examples, which makes the training process more efficient.
def create_tokenizer(vocab_size):
    """Build a ``TextVectorization`` layer that emits integer token ids.

    No vocabulary is supplied here, so the returned layer still has to be
    adapted (or given a vocabulary) before it can be used.

    Args:
        vocab_size: Upper bound on the vocabulary size (``max_tokens``).

    Returns:
        A ``tf.keras.layers.TextVectorization`` layer configured to lowercase
        the text, strip punctuation, split on whitespace and output ints.
    """
    vectorizer_options = {
        'max_tokens': vocab_size,
        'standardize': 'lower_and_strip_punctuation',
        'split': 'whitespace',
        'output_mode': 'int',
    }
    return tf.keras.layers.TextVectorization(**vectorizer_options)
def tokenize(texts, tokenizer):
    """Run ``texts`` through ``tokenizer`` and return its output unchanged.

    Args:
        texts: Text input accepted by ``tokenizer``.
        tokenizer: Callable vectorization layer, e.g. the one returned by
            ``create_tokenizer``.

    Returns:
        Whatever ``tokenizer(texts)`` produces.
    """
    token_ids = tokenizer(texts)
    return token_ids
def pad_sequence(seq, max_len):
    """Right-pad a 1-D sequence with zeros until it holds ``max_len`` items.

    Sequences that already have ``max_len`` or more elements are returned
    without truncation. The padding value is 0 (the padding token) in the
    sequence's own dtype, so int32 and int64 token ids are both accepted.

    Args:
        seq: 1-D tensor (or tensor-like) of token ids.
        max_len: Target length, as a Python/NumPy integer or scalar tensor.

    Returns:
        A 1-D tensor of length ``max(len(seq), max_len)``.
    """
    seq = tf.convert_to_tensor(seq)
    seq_len = tf.shape(seq)[0]
    # Clamp at zero instead of branching on a tensor with a Python `if`, so
    # the function behaves the same in eager mode and inside tf.function.
    padding_len = tf.maximum(tf.cast(max_len, seq_len.dtype) - seq_len, 0)
    # tf.pad fills with zeros of seq's dtype; the previous hand-built int64
    # padding tensor made tf.concat fail for any non-int64 sequence.
    return tf.pad(seq, [[0, padding_len]])
def get_max_sequence_length(dataset):
    """Return the largest axis-1 size found among the tensors of ``dataset``.

    Each dataset element may be a single tensor or a (possibly nested)
    structure of tensors such as ``(inputs, targets)``. Every tensor is
    expected to be shaped ``[batch_size, seq_length]``.

    Args:
        dataset: Iterable (e.g. a batched ``tf.data.Dataset``) of tensors or
            tensor structures.

    Returns:
        The maximum sequence length as a Python ``int``; 0 if ``dataset``
        yields nothing.

    Raises:
        ValueError: If a tensor has fewer than two dimensions, i.e. it has no
            sequence axis to measure.
    """
    max_length = 0
    for batch in dataset:
        # Handing a whole (inputs, targets) tuple to tf.shape converts it to
        # one stacked tensor, whose axis 1 is no longer the sequence axis.
        # Measure each component on its own instead.
        for component in tf.nest.flatten(batch):
            if int(tf.rank(component)) < 2:
                raise ValueError(
                    "get_max_sequence_length expects tensors shaped "
                    "[batch_size, seq_length]; got shape "
                    f"{tuple(component.shape)}"
                )
            max_length = max(max_length, int(tf.shape(component)[1]))
    return max_length
# The loss should ignore padding tokens
# We'll use a mask to compute loss only on valid tokens (non-padding tokens).
def loss_function(real, pred, mask):
    """Sparse categorical cross-entropy averaged over the unmasked tokens.

    Args:
        real: Integer target token ids.
        pred: Logits with one extra trailing vocabulary axis compared to
            ``real`` (the loss is built with ``from_logits=True``).
        mask: Same shape as ``real``; non-zero where the token counts and 0
            where it is padding. Any numeric/bool dtype is accepted.

    Returns:
        Scalar tensor: summed per-token loss over valid tokens divided by the
        number of valid tokens, or 0 when no token is valid.
    """
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    per_token_loss = loss_object(real, pred)
    mask = tf.cast(mask, dtype=per_token_loss.dtype)
    per_token_loss *= mask
    # divide_no_nan yields 0 instead of NaN when every token is padding, so a
    # fully padded example cannot poison the running loss or the gradients.
    return tf.math.divide_no_nan(
        tf.reduce_sum(per_token_loss), tf.reduce_sum(mask))
# To calculate accuracy on the non-padded tokens
def accuracy_function(real, pred, mask):
    """Fraction of unmasked tokens whose arg-max prediction equals the target.

    Args:
        real: Integer target token ids.
        pred: Scores/logits with a trailing vocabulary axis; the predicted id
            is the arg-max over that last axis.
        mask: Same shape as ``real``; non-zero where the token counts and 0
            where it is padding.

    Returns:
        Scalar float32 tensor in [0, 1], or 0 when no token is valid.
    """
    predicted_ids = tf.argmax(pred, axis=-1)
    # tf.equal needs matching dtypes; argmax returns int64 while the targets
    # may be int32.
    matches = tf.equal(tf.cast(real, predicted_ids.dtype), predicted_ids)
    matches = tf.cast(matches, dtype=tf.float32)
    # The mask must be float as well: casting it to the bool dtype of the raw
    # comparison (as before) makes both the multiply and reduce_sum fail.
    mask = tf.cast(mask, dtype=tf.float32)
    matches *= mask
    return tf.math.divide_no_nan(tf.reduce_sum(matches), tf.reduce_sum(mask))
# Here, we define a single training step
# We'll use tf.GradientTape to compute gradients and apply them using the optimizer
# and calculate loss and accuracy
@tf.function
def train_step(input, target, transformer, optimizer):
    """Run one optimization step and return the masked training loss.

    Args:
        input: Source token ids handed to ``transformer.predict``.
        target: Target token ids; id 0 is treated as padding and excluded
            from the loss.
        transformer: Model exposing ``predict(input, target)`` and
            ``trainable_variables``.
        optimizer: Optimizer exposing ``apply_gradients``.

    Returns:
        Scalar loss tensor for this step.
    """
    with tf.GradientTape() as tape:
        # Forward pass. The model output is used directly for the loss; it
        # was previously overwritten by a symbolic tf.keras.layers.Input
        # placeholder, which discarded the prediction.
        # NOTE(review): `predict` is kept as in the original; Transformer is
        # defined in main.py — confirm this call is differentiable (a stock
        # Keras Model.predict would not be).
        predictions = transformer.predict(input, target)
        # 1.0 for real tokens, 0.0 for padding (id 0), as in test_step.
        mask = tf.cast(tf.math.not_equal(target, 0), tf.float32)
        loss = loss_function(target, predictions, mask)
    # Back-propagate and update the model weights.
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    return loss
# During validation or testing, no gradient calculation is needed
# We'll simply evaluate the model's performance on the validation set
@tf.function
def test_step(input, target, transformer):
    """Evaluate one example/batch without computing gradients.

    Args:
        input: Source token ids handed to ``transformer.predict``.
        target: Target token ids; id 0 is treated as padding.
        transformer: Model exposing ``predict(input, target)``.

    Returns:
        Tuple ``(loss, accuracy)`` computed over the non-padding tokens.
    """
    model_output = transformer.predict(input, target)
    # Float mask: 1.0 where the target holds a real token, 0.0 on padding.
    is_real_token = tf.math.not_equal(target, 0)
    valid_mask = tf.cast(is_real_token, tf.float32)
    step_loss = loss_function(target, model_output, valid_mask)
    step_accuracy = accuracy_function(target, model_output, valid_mask)
    return step_loss, step_accuracy
# now we can put everything together in a loop to train the Transformer model
# Training-loop settings.
EPOCHS = 20
BUFFER_SIZE = 20000  # size of the shuffle buffer
BATCH_SIZE = 64

# Training pipeline: shuffle, then batch, then prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same steps, but without shuffling.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((input_val, target_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Hyperparameters passed to the Transformer constructor and the optimizer.
d_model = 512
num_layers = 6
num_heads = 8
d_ff = 2048
input_vocab_size = 10000
output_vocab_size = 10000
learning_rate = 0.001

max_seq_len_train = get_max_sequence_length(train_dataset)
print(f"The maximum sequence length is: {max_seq_len_train}")

# Optimizer, tokenizer and model instances.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)
tokenizer = create_tokenizer(input_vocab_size)
transformer = Transformer(
    num_layers, d_model, num_heads, d_ff, input_vocab_size, output_vocab_size
)

# Build the tokenizer vocabulary from the input half of the (batched)
# training pairs; the targets are dropped by the map.
tokenizer.adapt(train_dataset.map(lambda inp, tar: inp))
# The validation pipeline is not shuffled, so its maximum length is identical
# every epoch; scan the dataset once instead of once per epoch.
max_seq_len_val = get_max_sequence_length(val_dataset)

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    total_loss = 0

    # ---- Training: every batch is walked one (source, target) pair at a
    # time, and each pair gets its own optimization step. ----
    for batch, (inp, tar) in enumerate(train_dataset):
        # Loop names avoid shadowing the builtin `input`.
        for src_text, tgt_text in zip(inp, tar):
            print(f'Input: {src_text}')
            print(f'Target: {tgt_text}')
            # Tokenize; the same tokenizer serves both source and target.
            src_token = tokenize(src_text, tokenizer)
            tgt_token = tokenize(tgt_text, tokenizer)
            # Zero-pad both sequences to the training maximum length. These
            # are padded token sequences, not masks.
            src_padded = pad_sequence(src_token, max_seq_len_train)
            tgt_padded = pad_sequence(tgt_token, max_seq_len_train)
            print(src_padded, tgt_padded)
            batch_loss = train_step(src_padded, tgt_padded, transformer, optimizer)
            total_loss += batch_loss
        # Progress line every 50 batches; reports the loss of the most
        # recent training step.
        if batch % 50 == 0:
            print(f'Batch {batch} Loss {batch_loss:.4f}')

    # ---- Validation after each epoch ----
    total_val_loss = 0
    total_val_accuracy = 0
    num_val_steps = 0  # number of test_step calls that were accumulated
    print(f"The maximum sequence length is: {max_seq_len_val}")
    for inp, tar in val_dataset:
        for src_text, tgt_text in zip(inp, tar):
            src_token = tokenize(src_text, tokenizer)
            tgt_token = tokenize(tgt_text, tokenizer)
            src_padded = pad_sequence(src_token, max_seq_len_val)
            tgt_padded = pad_sequence(tgt_token, max_seq_len_val)
            val_loss, val_accuracy = test_step(src_padded, tgt_padded, transformer)
            total_val_loss += val_loss
            total_val_accuracy += val_accuracy
            num_val_steps += 1

    # Guard against an empty validation set, which would otherwise raise
    # ZeroDivisionError when averaging.
    if num_val_steps == 0:
        print(f'Epoch {epoch + 1} Validation skipped: validation set is empty')
        continue

    avg_val_loss = total_val_loss / num_val_steps
    avg_val_accuracy = total_val_accuracy / num_val_steps
    print(f'Epoch {epoch + 1} Validation Loss {avg_val_loss:.4f} Accuracy {avg_val_accuracy:.4f}')