Commit
oops i did it again i regenerated the infowars corpus
Persephone Karnstein authored and committed on Jun 21, 2023
1 parent 400c06a · commit 0a2dbb2
Showing 11 changed files with 259 additions and 29 deletions.
Binary file not shown.
@@ -0,0 +1,2 @@
model_checkpoint_path: "ckpt_2"
all_model_checkpoint_paths: "ckpt_2"
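The two lines above are the checkpoint index that TensorFlow writes next to the weight shards; tf.train.latest_checkpoint reads it to resolve the most recent prefix. A minimal sketch of that lookup, assuming the index sits in the ./models/training_checkpoints directory used by the training script later in this commit:

import tensorflow as tf

# Sketch only: assumes this index file lives in the checkpoint directory
# created by the training script in this commit.
checkpoint_dir = './models/training_checkpoints'
latest = tf.train.latest_checkpoint(checkpoint_dir)
print(latest)  # expected to end in "ckpt_2", per model_checkpoint_path above
# model.load_weights(latest)  # requires a MyModel built with the same vocab/dims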
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,6 +1,6 @@
 keras
 tensorflow
-protobuf==3.20.*
+protobuf==3.20.3
 sentencepiece
 deepmultilingualpunctuation
 alive-process
 rich
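The only change here swaps the floating protobuf==3.20.* constraint for an exact 3.20.3 pin. A quick way to confirm the pin took effect in the active environment, assuming the requirements have been installed with pip:

import google.protobuf

# Sanity check after `pip install -r requirements.txt`; 3.20.3 is the
# version pinned above.
print(google.protobuf.__version__)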
@@ -0,0 +1,211 @@
# https://www.tensorflow.org/text/tutorials/text_generation

import tensorflow as tf

import numpy as np
import os, glob
import time

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 3 = suppress INFO, WARNING, and ERROR logs from the TF C++ backend

def get_corpus_data():
    # Concatenate every .txt file in ./training-texts into a single corpus string.
    path = os.getcwd()
    files = glob.glob(path + '/training-texts/*.txt')
    data = ""
    # files = [files[1]]  # delete this line, this is just for testing
    for f in files:
        with open(f) as fh:
            data += fh.read()
    return data

text = get_corpus_data()
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

# Take a look at the first 250 characters in text
# print(text[:250])

# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
# print(chars)

ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None
)

ids = ids_from_chars(chars)
# print(ids)

chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

chars = chars_from_ids(ids)

tf.strings.reduce_join(chars, axis=-1).numpy()

def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# all_ids

ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

seq_length = 100
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(sequence):
    # For each sequence, the input is all but the last character and the
    # target is the same sequence shifted one character to the right.
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

model.summary()

sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss: ", example_batch_mean_loss)

tf.exp(example_batch_mean_loss).numpy()
model.compile(optimizer='adam', loss=loss)

# Directory where the checkpoints will be saved
checkpoint_dir = './models/training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

EPOCHS = 20

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')] * len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits / self.temperature
        # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask

        # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        # Convert from token ids to characters
        predicted_chars = self.chars_from_ids(predicted_ids)

        # Return the characters and model state.
        return predicted_chars, states

one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

start = time.time()
states = None
next_char = tf.constant(['the transgender', 'transgender', 'trans', 'I think that trans', 'why', 'I am', 'you are'])
result = [next_char]

for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_' * 80)
print('\nRun time:', end - start)

tf.saved_model.save(one_step_model, 'one_step')
# one_step_reloaded = tf.saved_model.load('one_step')
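The script ends by exporting the generator with tf.saved_model.save and only hints at reloading it in the commented-out line. A minimal reload-and-sample sketch under the same assumptions: it is run next to the exported 'one_step' directory, and because the reloaded tf.function only accepts the input shapes it was traced with before export, the seed batch mirrors the seven prompts used above.

import tensorflow as tf

# Sketch: reload the exported generator and sample from it, using the same
# seven seed prompts the training script traced generate_one_step with.
one_step_reloaded = tf.saved_model.load('one_step')

states = None
next_char = tf.constant(['the transgender', 'transgender', 'trans',
                         'I think that trans', 'why', 'I am', 'you are'])
result = [next_char]

for _ in range(200):  # arbitrary sample length for illustration
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)

for line in tf.strings.join(result).numpy():
    print(line.decode('utf-8'))
    print('_' * 80)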