-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkovp.py
119 lines (91 loc) · 4.69 KB
/
markovp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
import random
from hashtable import HashTable
from wordObj import wordObj
def build_lexicon(input_filename, lexicon, debug_filename="corpusdebug.txt"):
    """Analyse a text corpus and populate ``lexicon`` with word-transition data.

    The corpus is read line by line and split on whitespace. For every word
    the word that follows it on the same line is recorded (``None`` for the
    last word of a line). Words not yet in the table are inserted; for known
    words the stored entry's ``frequency`` is incremented and the follower is
    appended to its ``followingWords`` list. Afterwards the table is printed
    and every occupied slot is dumped to ``debug_filename``.

    Args:
        input_filename: Path of the UTF-8 text corpus to read.
        lexicon: Table object providing ``search``, ``insert``, ``print``,
            ``size``, ``data`` and ``unoccupied``. ``search`` is expected to
            return the slot index of a word, or -1 when it is absent.
        debug_filename: Path of the debug dump that is (over)written with one
            line per occupied slot.
    """
    wordCount = 0
    uniqueCounter = 0

    with open(input_filename, 'r', encoding='utf-8') as corpus:
        for line in corpus:
            # split() without a separator never yields empty strings, so the
            # words need no emptiness check.
            words = line.split()
            last_index = len(words) - 1
            for i, word in enumerate(words):
                # Work out the follower with a plain index test rather than
                # raising and catching IndexError once per line.
                if i < last_index:
                    nextword = words[i + 1]
                    print(f"Word \"{word}\" is followed by \"{nextword}\"")
                else:
                    nextword = None
                    print(f"Word \"{word}\" is at the end of the line! -------------")

                # Insert the word if it is new, otherwise bump its frequency;
                # in both cases remember which word followed it.
                wordObject = wordObj(word)
                idx = lexicon.search(wordObject)
                if idx == -1:
                    wordObject.followingWords.append(nextword)
                    lexicon.insert(wordObject)
                    uniqueCounter += 1
                else:
                    known = lexicon.data[idx]
                    known.frequency += 1
                    known.followingWords.append(nextword)
                wordCount += 1
    # The with-block has already closed the corpus file at this point.

    lexicon.print()
    print(f"Total corpus words parsed: {wordCount}")
    print(f"Total unique words parsed: {uniqueCounter}")

    print("Writing corpus analysis to file for debugging purposes.")
    with open(debug_filename, 'w', encoding='utf-8') as debug_file:
        for i in range(lexicon.size):
            # Skip slots that still hold the table's "unoccupied" marker.
            if lexicon.data[i] != lexicon.unoccupied:
                debug_file.write(str(lexicon.data[i]) + "\n")
    print("Corpus analysis completed.")
def MarkovP(output_filename, lexicon, desired_length):
    """Generate text by a random walk over the lexicon and write it to a file.

    Occupied table slots are collected, a random starter word is chosen, and
    ``desired_length`` further words are appended. Each new word is drawn at
    random from the ``followingWords`` list of the current word; a ``None``
    entry means there is no recorded follower, in which case a fresh random
    word is used instead (a "reseed"). The result is printed and written to
    ``output_filename`` as space-separated words with a trailing space.

    Args:
        output_filename: Path of the UTF-8 text file to (over)write.
        lexicon: Table exposing ``size``, ``data`` and ``unoccupied``.
            Occupied slots hold objects with a ``spelling`` string and a
            ``followingWords`` list of strings and/or ``None``.
        desired_length: Number of words generated after the starter word, so
            the output holds ``desired_length + 1`` words in total.

    Raises:
        IndexError: If the lexicon has no occupied slots, because
            ``random.choice`` is called on an empty list.
    """
    print("Beginning Markov generation.")

    # The table may contain many unoccupied slots; keep only the real entries.
    # The pass is linear, but every slot has to be inspected once anyway.
    lexiconList = [
        lexicon.data[i]
        for i in range(lexicon.size)
        if lexicon.data[i] != lexicon.unoccupied
    ]

    # Index the entries by spelling once so each step finds the next word
    # object directly instead of rescanning the whole list. Later duplicates
    # overwrite earlier ones, matching a scan that keeps the last match.
    wordsBySpelling = {entry.spelling: entry for entry in lexiconList}

    currentWord = random.choice(lexiconList)  # pick a random starter word
    generated = [currentWord.spelling]

    for _ in range(desired_length):
        # nextWord is the spelling that is about to be appended.
        nextWord = random.choice(currentWord.followingWords)
        print(f"Current word: {currentWord.spelling}, Next word: {nextWord}")

        if nextWord is None:
            seed = random.choice(lexiconList)
            nextWord = seed.spelling
            print(f"Reseeding because the word didn't have any words following it. New seed: {seed.spelling}")

        generated.append(nextWord)

        # The word just appended becomes the current word for the next
        # iteration; if it is somehow unknown, stay on the current word.
        currentWord = wordsBySpelling.get(nextWord, currentWord)

    # Every word is followed by a single space, including the last one.
    outputString = " ".join(generated) + " "

    print(outputString)
    print(f"Writing to file {output_filename}...")
    with open(output_filename, 'w', encoding='utf-8') as out_file:
        out_file.write(outputString)
    print("Markov generation complete.")
# Script entry: ask for the corpus, the output file and the desired length,
# then build the lexicon and generate the text.
htable = HashTable()  # instantiate empty hashtable

input_filename = input("Enter filename containing text corpus (suggested: sherlock.txt): ")
output_filename = input("Enter filename for generated text output (suggested: output.txt): ")
try:
    desired_length = int(input("Enter desired length of generated text in words (suggested: 500): "))
except ValueError:
    # Report bad numeric input cleanly instead of dumping a traceback.
    print("Desired length must be a whole number.")
    print("Exiting...")
    raise SystemExit(1) from None

if not os.path.isfile(input_filename):
    print(f"File {input_filename} could not be opened, or was not found.")
    print("Exiting...")
    # exit() is a helper injected by the site module for interactive use and
    # would report success; raise SystemExit with a failure status instead.
    raise SystemExit(1)

build_lexicon(input_filename, htable)  # Analyse text corpus and populate hashtable
MarkovP(output_filename, htable, desired_length)  # Generate markov chain and write to output file