added infowars transcripts for enhanced brainrot. added some parsing …

…of them.
PersephoneKarnstein · Jun 18, 2023 · b8c43de · b8c43de
1 parent 0a345ca
commit b8c43de
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 4 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "infowars"]
+	path = infowars
+	url = https://github.com/Fudge/infowars.git
diff --git a/README.md b/README.md
@@ -6,12 +6,14 @@ The world is getting worse and transphobes have no imaginiation so I figured a c
 
 ## Isn't this just a hatespeech generator?
 
-look ok when you say it like that it sounds like a much less funny idea ok. I'm trying to put in some texts into the training data from conspiracy theory pages and stuff to ensure it gets absolutely batshit rather than just parroting ideology but at the end of the day it will be down to the user to prevent misuse
+look ok when you say it like that it sounds like a much less funny idea ok. That's why I'm adding Infowars and stuff into the training data, and trying to find stuff from conspiracy theory pages -- to ensure it gets absolutely batshit rather than just parroting ideology. But at the end of the day it will be down to the user to prevent misuse
 
 ![](pride-line.png)
 
 ## Why all the rainbow line breaks?
 
 cuz i'm gay
 
+and like to reinforce that i'm not making this because I agree with terfs in any way
+
 ![](pride-line.png)
diff --git a/infowars b/infowars
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,6 @@
 keras
 tensorflow
+protobuf==3.20.*
+sentencepiece
+deepmultilingualpunctuation
+alive-progress
diff --git a/src/terfy/clean_infowars.py b/src/terfy/clean_infowars.py
@@ -0,0 +1,31 @@
+from deepmultilingualpunctuation import PunctuationModel
+from alive_progress import alive_bar
+import os,re,glob
+
+# Shared punctuation-restoration model, created once at import time and
+# reused by clean_text() for every transcript file.
+# NOTE(review): construction presumably loads model weights and may be slow
+# or need network access on first use -- confirm against the library docs.
+model = PunctuationModel()
+
+def clean_text():
+    """Build one punctuated text blob from the Infowars transcript files.
+
+    Reads every file matched by ``<cwd>/../infowars/*.txt``, strips the
+    leading ``[start --> end]`` timestamp from each line, flattens the file
+    into a single space-separated string and passes that string through the
+    module-level punctuation ``model``.
+
+    Returns:
+        str: the restored text of all files, concatenated in the order
+        ``glob`` returned them, with no separator inserted between files.
+    """
+    path = os.getcwd()
+    # The transcript directory is resolved relative to the current working
+    # directory, so the result depends on where the script is launched from.
+    files = glob.glob(path + '/../infowars/*.txt')
+    timestamp = re.compile(r'^\s*\[\d{1,2}(:\d{2})?:\d{2}\.\d{3} --> \d{1,2}(:\d{2})?:\d{2}\.\d{3}\]\s+')
+    restored = []
+    with alive_bar(len(files)) as bar:
+        for f in files:
+            # Context manager closes each transcript as soon as it is read,
+            # instead of leaking one open handle per file.
+            with open(f) as transcript:
+                # Remove the timestamps and join the lines with single
+                # spaces (each piece keeps its leading space, as before).
+                result = "".join(
+                    " " + timestamp.sub('', line).strip()
+                    for line in transcript
+                )
+            restored.append(model.restore_punctuation(result))
+            bar()
+    # Join once at the end rather than growing a string with repeated +=.
+    return "".join(restored)
+
+def save_to_corpus(text):
+    """Write ``text`` to ``training-texts/alexjones.txt`` under the cwd.
+
+    Any existing file at that path is deleted first, then the file is
+    created anew and ``text`` is written to it.
+
+    Args:
+        text: the string to store as the corpus file's entire contents.
+    """
+    target = os.getcwd() + "/training-texts/alexjones.txt"
+    stale_copy_present = os.path.exists(target)
+    if stale_copy_present:
+        # Remove the previous output so the write below starts from scratch.
+        os.remove(target)
+    with open(target, "w") as corpus_file:
+        corpus_file.write(text)
+
+# Script driver: runs at import/execution time. Cleans every transcript and
+# overwrites the corpus file with the combined result.
+data = clean_text()
+save_to_corpus(data)
diff --git a/src/terfy/model.py b/src/terfy/model.py
@@ -63,8 +63,8 @@ def generate_text(seed_text, next_words, max_sequence_len):
 	for _ in range(next_words):
 		token_list = tokenizer.texts_to_sequences([seed_text])[0]
 		token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
-		predicted = model.predict_classes(token_list, verbose=0)
-
+		# predicted = model.predict_classes(token_list, verbose=0)
+		predicted = (model.predict(token_list) > 0.5).astype("int32")		
 		output_word = ""
 		for word, index in tokenizer.word_index.items():
 			if index == predicted:
@@ -81,6 +81,15 @@ def get_corpus_data():
         data += open(f).read()
     return data
 
+def save_model(model):
+	"""Persist ``model`` to the current working directory.
+
+	Writes the output of ``model.to_json()`` to ``model.json`` and then
+	calls ``model.save_weights("model.h5")``.
+
+	Args:
+		model: object exposing ``to_json()`` and ``save_weights(path)``.
+	"""
+	# Serialize first, so model.json is only opened once the JSON exists.
+	architecture = model.to_json()
+	with open("model.json", "w") as out:
+		out.write(architecture)
+	# Weights go to a separate HDF5 file next to the JSON description.
+	model.save_weights("model.h5")
+
 
 with alive_bar(title="\033[38;5;14m[INFO]\033[0m Compiling corpus...".ljust(35), stats=False, monitor=False) as bar:
 	data = get_corpus_data()
@@ -89,8 +98,11 @@ def get_corpus_data():
 	predictors, label, max_sequence_len, total_words = dataset_preparation(data)
 
 # with alive_bar(title="\033[38;5;14m[INFO]\033[0m Generating model...".ljust(30), stats=False, monitor=False) as bar:
-print("\033[38;5;14m[INFO]\033[0m Generating model...".ljust(35))
+print("\033[38;5;14m[INFO]\033[0m Training model...".ljust(35))
 model = create_model(predictors, label, max_sequence_len, total_words)
 
+with alive_bar(title="\033[38;5;14m[INFO]\033[0m Saving model...".ljust(35), stats=False, monitor=False) as bar:
+	save_model(model)
+
 with alive_bar(title="\033[38;5;14m[INFO]\033[0m Generating text...".ljust(35), stats=False, monitor=False) as bar:
 	print(generate_text("we naughty", 3, max_sequence_len))