Skip to content

Commit

Permalink
added infowars transcripts for enhanced brainrot. added some parsing …
Browse files Browse the repository at this point in the history
…of them.
  • Loading branch information
Persephone Karnstein authored and Persephone Karnstein committed Jun 18, 2023
1 parent 0a345ca commit b8c43de
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "infowars"]
path = infowars
url = https://github.com/Fudge/infowars.git
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ The world is getting worse and transphobes have no imagination so I figured a c

## Isn't this just a hatespeech generator?

look ok when you say it like that it sounds like a much less funny idea ok. I'm trying to put in some texts into the training data from conspiracy theory pages and stuff to ensure it gets absolutely batshit rather than just parroting ideology but at the end of the day it will be down to the user to prevent misuse
look ok when you say it like that it sounds like a much less funny idea ok. That's why I'm adding Infowars and stuff into the training data, and trying to find stuff from conspiracy theory pages -- to ensure it gets absolutely batshit rather than just parroting ideology. But at the end of the day it will be down to the user to prevent misuse

![](pride-line.png)

## Why all the rainbow line breaks?

cuz i'm gay

and like to reinforce that i'm not making this because I agree with terfs in any way

![](pride-line.png)
1 change: 1 addition & 0 deletions infowars
Submodule infowars added at 94192e
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
keras
tensorflow
protobuf==3.20.*
sentencepiece
deepmultilingualpunctuation
alive-progress
31 changes: 31 additions & 0 deletions src/terfy/clean_infowars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from deepmultilingualpunctuation import PunctuationModel
from alive_progress import alive_bar
import os,re,glob

model = PunctuationModel()

def clean_text():
path = os.getcwd()
files = glob.glob(path + '/../infowars/*.txt')
data = ""
timestamp = re.compile(r'^\s*\[\d{1,2}(:\d{2})?:\d{2}\.\d{3} --> \d{1,2}(:\d{2})?:\d{2}\.\d{3}\]\s+')
with alive_bar(len(files)) as bar:
for f in files:
result = ""
for line in open(f).readlines():
result += " "+timestamp.sub('', line).strip() #remove the timestamps
result = model.restore_punctuation(result)
data += result
bar()
return data

def save_to_corpus(text):
path = os.getcwd()
path += "/training-texts/alexjones.txt"
if os.path.exists(path):
os.remove(path)
with open(path, "w") as f:
f.write(text)

data = clean_text()
save_to_corpus(data)
18 changes: 15 additions & 3 deletions src/terfy/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def generate_text(seed_text, next_words, max_sequence_len):
for _ in range(next_words):
token_list = tokenizer.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
predicted = model.predict_classes(token_list, verbose=0)

# predicted = model.predict_classes(token_list, verbose=0)
predicted = (model.predict(token_list) > 0.5).astype("int32")
output_word = ""
for word, index in tokenizer.word_index.items():
if index == predicted:
Expand All @@ -81,6 +81,15 @@ def get_corpus_data():
data += open(f).read()
return data

def save_model(model):
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
# print("Saved model to disk")


with alive_bar(title="\033[38;5;14m[INFO]\033[0m Compiling corpus...".ljust(35), stats=False, monitor=False) as bar:
data = get_corpus_data()
Expand All @@ -89,8 +98,11 @@ def get_corpus_data():
predictors, label, max_sequence_len, total_words = dataset_preparation(data)

# with alive_bar(title="\033[38;5;14m[INFO]\033[0m Generating model...".ljust(30), stats=False, monitor=False) as bar:
print("\033[38;5;14m[INFO]\033[0m Generating model...".ljust(35))
print("\033[38;5;14m[INFO]\033[0m Training model...".ljust(35))
model = create_model(predictors, label, max_sequence_len, total_words)

with alive_bar(title="\033[38;5;14m[INFO]\033[0m Saving model...".ljust(35), stats=False, monitor=False) as bar:
save_model(model)

with alive_bar(title="\033[38;5;14m[INFO]\033[0m Generating text...".ljust(35), stats=False, monitor=False) as bar:
print(generate_text("we naughty", 3, max_sequence_len))

0 comments on commit b8c43de

Please sign in to comment.