-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsilabeador.py
executable file
·353 lines (306 loc) · 14.6 KB
/
silabeador.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#!/usr/bin/env python3
# http://www.rimar2000.com.ar/
import pandas as pd
import re
import sys
import phonetics
import preprocess
class Silabeador():
def __init__(self, verbose=1, **ph):
'''
Silabeador class counts metric syllables in a text.
Its main function is count_syllables_sentence(sentence)
along with other relevant information throughout the process.
'''
self.verbose = verbose # to print out the result of each function.
# verbose == 1: prints out the sentence and syllable's number retrieved.
# verbose == 2: prints out the result of every function.
# relevant phonetics information used to separate the sentence into metric syllables.
self.vowels = ph['vowels']
self.consonants = ph['consonants']
self.alphabet = ph['alphabet']
self.phonemes_dict = ph['phonemes_dict']
self.char2phone = ph['char2phone'] # hierarchical rules to chage characters to phones
self.punctuation = ph['punctuation']
self.double_consonants = ph['double_consonants']
# variables used within each count_syllables_sentence call
self.sentence = []
self.phonemes = []
self.structure = []
self.structure_syllables = []
self.word_syllables = []
self.number_syllables = 0
self.last_word = {'word': None,\
'phonemes': None,\
'structure': None,\
'word_syllables': None,\
'monosilabo': None,\
'accent': None}
self.sinalefa = False # bool to count using sinalefa
def count_syllables_text(self, text):
syllables_text = []
for line in text:
syllables_text.append(self.count_syllables_sentence(line))
def count_syllables_sentence(self, sentence, debug=False):
'''
main function that takes a sentence as input and counts the number of metric syllables
input: string
output: number of syllables
'''
self.sentence = sentence.split()
# converting characters to phonemes (i.e. 'cazar' --> 'kasar')
self.phonemes = [self.word2phonemes(word) for word in self.sentence]
# converting phonemes to structure (i.e. 'kasar' --> 'CFCFC')
self.structure = [self.phonemes2structure(word) for word in self.phonemes]
# dividing the structure in syllables (i.e. 'CFCFC' --> CF-CFC)
self.structure_syllables = [self.divide_structure_syllables(word_structure) for word_structure in self.structure]
# dividing the word in syllables according to how the structure was previously divided (i.e. 'cazar' --> 'ca-zar')
self.word_syllables = [self.add_separator(structure, word) for structure, word in zip(self.structure_syllables, self.phonemes)]
# metric rules are considered when counting syllables in a sentence
self.last_word = self.get_last_word()
if self.sinalefa: # applying sinalefa (i.e. 'vuel-ta-y-me-dia' --> 'vuel-tay-me-dia')
self.word_syllables = self.apply_sinalefa()
self.number_syllables = self.count(self.word_syllables, self.metric_rule(self.last_word))
if debug and self.number_syllables != 8:
print(sentence)
if self.verbose == 1:
print(f'{" ".join(self.sentence)} --> {self.word_syllables} [{self.number_syllables}]')
elif self.verbose == 2:# and self.number_syllables != 8: # add second condition for debugging purposes
print('\nsentence', sentence)
print('phonemes', self.phonemes)
print('structure', self.structure)
print('structure syllables', self.structure_syllables)
print('word syllables', self.word_syllables)
print('number of metric syllables', self.number_syllables)
elif self.verbose == 3:
print('last word:', self.last_word['word'])
print('last word structure', self.last_word['structure'])
print('last word phonemes', self.last_word['phonemes'])
print('last word syllables', self.last_word['word_syllables'])
print('last word accent:', self.last_word['accent'])
print('is last word monosilabo:', self.last_word['monosilabo'])
return self.number_syllables # amount of metric syllables in the sentence
def count_syllables_word(self, word):
self.sentence = word.split()
# converting characters to phonemes (i.e. 'cazar' --> 'kasar')
self.phonemes = [self.word2phonemes(word) for word in self.sentence]
# converting phonemes to structure (i.e. 'kasar' --> 'CFCFC')
self.structure = [self.phonemes2structure(word) for word in self.phonemes]
# dividing the structure in syllables (i.e. 'CFCFC' --> CF-CFC)
self.structure_syllables = [self.divide_structure_syllables(word_structure) for word_structure in self.structure]
# dividing the word in syllables according to how the structure was previously divided (i.e. 'cazar' --> 'ca-zar')
self.word_syllables = [self.add_separator(structure, word) for structure, word in zip(self.structure_syllables, self.phonemes)]
# metric rules are considered when counting syllables in a sentence
self.last_word = self.get_last_word()
return self.last_word
def apply_sinalefa(self):
"""
function that applies sinalefa over word_syllables.
it checks if two contiguous syllables have a vowel and it merges them
in the same syllable.
"""
syllables_sinalefa = []
index = 0
while index < len(self.word_syllables):
try:
# checking if there is sinalefa
if self.are_vowels(syllables_sinalefa[-1][-1], self.word_syllables[index][0]):
merged_syllables = ''.join([syllables_sinalefa[-1], self.word_syllables[index]])
# replacing the last syllable with the merged syllable
syllables_sinalefa.pop(-1)
syllables_sinalefa.append(merged_syllables)
else:
syllables_sinalefa.append(self.word_syllables[index])
except IndexError:
# we reached the last word
syllables_sinalefa.append(self.word_syllables[index])
finally:
index += 1
return '-'.join(syllables_sinalefa)
def are_vowels(self, c1, c2):
"""
function that compares if two characters are vowels.
it returns true if so, otherwise false.
"""
pattern = re.compile(f"[{''.join(self.vowels)}]")
if re.match(pattern, c1) and re.match(pattern, c2):
return True
else:
return False
def word2phonemes(self, word):
"""
function that takes a word and translates it to phonemes
following the rules from char2phone dictionary
"""
# findall retrieves a list of characters only
phonemes = ''.join(re.findall('[^\W]*', word)) # TODO: deal with punctuation in the middle of sentence
for rule in self.char2phone.keys(): # certain rules must be applied first
for char in self.char2phone[rule]:
if char in phonemes:
# modifying the word on the fly
phonemes = re.sub(char, self.char2phone[rule][char], phonemes)
return phonemes
def phonemes2structure(self, phonemes):
"""
function that takes a sequence of phonemes and translates it to its structure
following the mapping in phonemes_dict dictionary.
double consonants are mapped to just one structure character 'T'
"""
#print(phonemes)
phonemes_reduced = self.reduce_double_syllables(phonemes)
structure = [self.phonemes_dict[phone] if phone in self.alphabet else phone for phone in phonemes_reduced]
#print(structure)
return structure
def divide_structure_syllables(self, word_structure, debug=False):
sequence = ''.join(word_structure)
pattern = re.compile(r"""(CDDC(?![AFD]))?
(CDAC(?![AFD]))?
(CFDC)?
(CDD)?
(CDFC(?![AFD]))?
(CDF)?
(CDA)?
(TF)?
(CDC(?![AFD]))?
(DFC)?
(CCF)?
(CFC(?![AFD]))?
(CAC(?![AFD]))?
(TFC(?![AFD]))?
(CFT(?![AFD]))?
(CFC(?![AFD]))?
(TDF)?
(CF)?
(DF)?
(TD)?
(DT)?
(TAC)?
(TA)?
(FC(?![AFD]))?
(FC(?![AFD]))?
(FD)?
(CD(?![AFD]))?
(DC(?=[CT]))?
(CA(?![AFD]))?
(F(?=C[AFD])?)?
(A)?
(D)?
""", flags=re.VERBOSE)
match = re.findall(pattern, sequence)
structure_syllables = '-'.join([syllable for group in match for syllable in group if syllable])
if debug:
print('word structure', word_structure)
print('sequence', sequence)
print('match', match)
print('structure syllable', structure_syllables)
return structure_syllables
def reduce_double_syllables(self, word):
# TODO: add some exceptions
word = [char for char in word]
for char in range(len(word)):
if ''.join(word[char:char+2]) in self.double_consonants:
# add a more elegant exception here if double consonants 'ns' is followed by vowels or not.
# examples to fix: constitución (working), funsión (not working)
if re.search(r'nsi', ''.join(word)):
continue
word[char] = 'T'
word.pop(char+1)
return ''.join(word)
def add_separator(self, structure, word, debug=False):
"""
function that adds a hyphen to separate the word's syllables.
it takes a separated structure (i.e. F-F-CF) and its word ('aora')
and it inserts the hyphens ('aora' --> 'a-o-ra').
Special cases are structure T which stands for double consonants (i.e. 'ns', 'tr')
thus these cases are mapped to two characters.
"""
word_segmented = list(word)
char = 0
forward = 0
while char < len(structure) - 1:
if structure[char] == '-':
word_segmented.insert(char + forward, '-')
elif structure[char] == 'T':
forward += 1 # moving one forward because T is mapped to two characters
char += 1
if debug:
print(structure, word, word_segmented, ''.join(word_segmented))
return ''.join(word_segmented)
def metric_rule(self, last_word):
'''
applies metric rule:
+1 if last word is aguda
+0 if last word is grave
-1 if last word is esdrujula
input: self (uses information from dictionary last_word)
output: int [+1, 0, -1]
'''
monosilabo = self.is_monosilabo(last_word['word_syllables'])
accent = self.acentuacion(last_word['phonemes'], last_word['structure'], monosilabo)
self.last_word['monosilabo'] = monosilabo
self.last_word['accent'] = accent
if monosilabo:
return +1
if accent == 'aguda':
return +1
elif accent == 'grave':
return 0
else: # accent == 'esdrujula':
return -1
def acentuacion(self, phonemes, structure, monosilabo=False):
'''
checks if the word is aguda, grave or esdrujula.
input: word as a tuple of its phonemes and structure.
output: string ['aguda', 'grave', 'esdrujula']
'''
if 'A' in structure:
pos = structure.index('A')
if (structure[-1] == 'A') or (structure[-2] == 'A' and phonemes[-1] in ['n','s']):
return 'aguda'
elif structure[pos:].count('D') + structure[pos:].count('F') >= 2:
return 'esdrujula'
else:
return 'grave'
else:
if monosilabo:
return 'none' # ???
elif phonemes[-1] in (self.vowels + ['n','s']):
return 'grave'
else:
return 'aguda'
def is_monosilabo(self, word_syllables):
'''
checks if structure is a monosilabo.
input: structure of a word
output: boolean
'''
if len(word_syllables.split('-')) > 1:
return False
else:
return True
def get_last_word(self):
last_word = {'word':self.sentence[-1],\
'phonemes': self.phonemes[-1],\
'structure': self.structure[-1],\
'word_syllables': self.word_syllables[-1]}
return last_word
def count(self, word_syllables, last_word_count):
number_syllables = len(word_syllables.split('-'))
total_syllables = number_syllables + last_word_count
return total_syllables
if __name__ == '__main__':
# ph is a dictionary with information regarding vowels, alphabet, char2phone rules, etc.
ph = phonetics.phonetics
text = preprocess.read_txt('data/decima2.txt')
#text = preprocess.read_txt('cases.txt')
#text = preprocess.read_txt('data/decimas_data_small.txt')
silabeador = Silabeador(**ph, verbose=2)
silabeador.sinalefa = True # counting using sinalefa
#silabeador.count_syllables_sentence(text[11])
silabeador.count_syllables_text(text)
silabeador.count_syllables_sentence('funsión')
#silabeador.divide_structure_syllables(['C', 'D', 'T', 'D', 'A', 'C'], debug=True)
#silabeador.add_separator('CF-CA-F', 'maría', debug=True)
# sinéresis: dos vocales que no forman diptongo, forman diptongo. (e.g. gor-je-ar --> gor-jear)
# diéresis: se separan dos vocales que forman diptongo. (e.g. sua-ve --> su-a-ve)
# sinalefa: dos vocales de palabras contiguas se diptonguean. (e.g. cie-lo-y-mar --> cie-loy-mar)