-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
103 lines (86 loc) · 5.88 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from vowel_consonant_classifier import svd_method
from abjad_alphabet_classifier import KL_based_method, count_method
from sukhotin import sukhotins_method
from preprocess import preprocess
def calculate_precision_recall_accuracy(correct_vowels, correct_consonants, putative_vowels, putative_consonants):
    """Print type-level precision, recall and accuracy of a vowel/consonant split.

    Each distinct letter counts once. "Positive" means "classified as a vowel".

    Args:
        correct_vowels: gold-standard vowels.
        correct_consonants: gold-standard consonants.
        putative_vowels: letters the classifier labelled as vowels.
        putative_consonants: letters the classifier labelled as consonants.

    Returns:
        None. The report is written to stdout; nothing is printed when
        precision or recall would require a division by zero.
    """
    gold_vowels, gold_consonants = set(correct_vowels), set(correct_consonants)
    guessed_vowels, guessed_consonants = set(putative_vowels), set(putative_consonants)

    true_pos = len(gold_vowels & guessed_vowels)
    false_pos = len(gold_consonants & guessed_vowels)
    false_neg = len(gold_vowels & guessed_consonants)
    true_neg = len(gold_consonants & guessed_consonants)

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    # Precision or recall is undefined: stay silent instead of dividing by zero.
    if predicted_pos == 0 or actual_pos == 0:
        return

    precision = true_pos / predicted_pos
    recall = true_pos / actual_pos
    accuracy = (true_pos + true_neg) / (predicted_pos + false_neg + true_neg)

    # `!s` forces str() so the output matches plain string concatenation.
    print(
        f"putative vowels: {putative_vowels!s}"
        f"\nputative consonants: {putative_consonants!s}"
        f"\nPRECISION: {precision!s}"
        f"\nRECALL: {recall!s}"
        f"\nACCURACY: {accuracy!s}"
        "\n"
    )
def calculate_token_precision_recall_accuracy(correct_vowels, correct_consonants, putative_vowels,
                                              putative_consonants, num_tokens, all_letters):
    """Print token-level precision, recall and accuracy of a vowel/consonant split.

    Same confusion matrix as the type-level measure, except that every letter
    contributes its token count instead of 1. "Positive" means "classified as
    a vowel".

    Args:
        correct_vowels: gold-standard vowels.
        correct_consonants: gold-standard consonants.
        putative_vowels: letters the classifier labelled as vowels.
        putative_consonants: letters the classifier labelled as consonants.
        num_tokens: per-letter counts; num_tokens[k] is added for all_letters[k].
        all_letters: letters, indexed in parallel with num_tokens.

    Returns:
        None. The report is written to stdout; nothing is printed when
        precision or recall would require a division by zero.
    """
    gold_vowels, gold_consonants = set(correct_vowels), set(correct_consonants)
    guessed_vowels, guessed_consonants = set(putative_vowels), set(putative_consonants)

    # Fill the lookup from lowest to highest priority so that a letter present
    # in several cells ends up in the first matching one: tp, fp, fn, tn.
    cell_of = {}
    for cell, members in (
        ("tn", gold_consonants & guessed_consonants),
        ("fn", gold_vowels & guessed_consonants),
        ("fp", gold_consonants & guessed_vowels),
        ("tp", gold_vowels & guessed_vowels),
    ):
        for letter in members:
            cell_of[letter] = cell

    totals = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
    for position, count in enumerate(num_tokens):
        cell = cell_of.get(all_letters[position])
        # Letters outside both gold lists or both putative lists are ignored.
        if cell is not None:
            totals[cell] += count

    tp, fp, fn, tn = totals["tp"], totals["fp"], totals["fn"], totals["tn"]
    # Precision or recall is undefined: stay silent instead of dividing by zero.
    if (tp + fp) == 0 or (tp + fn) == 0:
        return

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)

    # `!s` forces str() so the output matches plain string concatenation.
    print(
        f"putative vowels: {putative_vowels!s}"
        f"\nputative consonants: {putative_consonants!s}"
        f"\nTOKEN PRECISION: {precision!s}"
        f"\nTOKEN RECALL: {recall!s}"
        f"\nTOKEN ACCURACY: {accuracy!s}"
        "\n"
    )
if __name__ == "__main__":
    # Script entry point: for each language, classify its letters into vowels
    # and consonants with the SVD-based method and Sukhotin's method, print the
    # evaluation against the gold lists, then run the KL-based and count-based
    # methods on the SVD result.
    #
    # SAMPLE TEXT SOURCES:
    #   ENGLISH - http://www.gutenberg.org/files/11/11-h/11-h.htm
    #   FRENCH  - http://www.gutenberg.org/cache/epub/4650/pg4650.html
    #   GREEK   - http://www.gutenberg.org/files/39536/39536-h/39536-h.htm
    #   ARABIC  - http://www.gutenberg.org/cache/epub/43007/pg43007.html
    #             https://ar.wikipedia.org/wiki/%D9%84%D8%BA%D8%A9_%D8%B9%D8%B1%D8%A8%D9%8A%D8%A9

    # Each entry: (corpus path, display name, all letters, gold vowels, gold consonants).
    languages = [
        (
            'Texts/english.txt',
            'Modern English',
            ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
             'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
            ['a', 'e', 'i', 'o', 'u'],
            # 's' was missing from this gold list (the French list has it), which
            # excluded it from every English metric.
            ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r',
             's', 't', 'v', 'x', 'z'],
        ),
        (
            'Texts/french.txt',
            'Modern French',
            ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
             'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â',
             'æ', 'ç', 'é', 'è', 'ê', 'ë', 'î', 'ï', 'ô', 'œ', 'ù', 'û', 'ü'],
            ['a', 'e', 'i', 'o', 'u', 'à', 'â', 'æ', 'é', 'è', 'ê', 'ë', 'î', 'ï',
             'ô', 'œ', 'ù', 'û', 'ü'],
            # NOTE(review): 'ç' (and 'w', 'y' in both Latin-script entries) is in
            # the alphabet but in neither gold list, so the metrics ignore it.
            # Confirm this is intended.
            ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r',
             's', 't', 'v', 'x', 'z'],
        ),
        (
            'Texts/greek.txt',
            'Modern Greek',
            ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ',
             'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω'],
            ['α', 'ε', 'η', 'ι', 'ο', 'υ', 'ω'],
            ['β', 'γ', 'δ', 'ζ', 'θ', 'κ', 'λ', 'μ', 'ν', 'ξ', 'π', 'ρ', 'σ', 'ς',
             'τ', 'φ', 'χ', 'ψ'],
        ),
        (
            'Texts/arabic.txt',
            'Arabic',
            ['غ', 'ظ', 'ض', 'ذ', 'خ', 'ث', 'ت', 'ش', 'ر', 'ق', 'ص', 'ف', 'ع', 'س',
             'ن', 'م', 'ل', 'ك', 'ي', 'ط', 'ح', 'ز', 'و', 'ه', 'د', 'ج', 'ب', 'أ'],
            # No gold lists: the evaluation helpers print nothing for this entry.
            [],
            [],
        ),
    ]

    for path, name, all_letters, correct_vowels, correct_consonants in languages:
        # One-off preprocessing step, left disabled: preprocess(path, all_letters)

        print("\nClassifying vowels and consonants of " + name + " using SVD-based method...\n")
        # `with` closes the corpus even if the classifier raises.
        with open(path, 'r', encoding='utf-8') as corpus:
            weighted_A, svd_vowels, svd_consonants, num_tokens = svd_method(corpus, all_letters)
        calculate_precision_recall_accuracy(correct_vowels, correct_consonants, svd_vowels, svd_consonants)
        calculate_token_precision_recall_accuracy(correct_vowels, correct_consonants, svd_vowels,
                                                  svd_consonants, num_tokens, all_letters)

        print("\nClassifying vowels and consonants of " + name + " using Sukhotin's method...\n")
        # Reopened so Sukhotin's method gets a fresh handle positioned at the start.
        with open(path, 'r', encoding='utf-8') as corpus:
            sukhotin_vowels, sukhotin_consonants = sukhotins_method(corpus, all_letters)
        calculate_precision_recall_accuracy(correct_vowels, correct_consonants, sukhotin_vowels, sukhotin_consonants)

        # NOTE(review): assumed to run once per language (inside the loop) -
        # confirm against the original indentation.
        # The smaller of the two SVD classes is always passed first.
        if len(svd_vowels) > len(svd_consonants):
            KL_based_method(weighted_A, all_letters, svd_consonants, svd_vowels)
        else:
            KL_based_method(weighted_A, all_letters, svd_vowels, svd_consonants)

        with open(path, 'r', encoding='utf-8') as corpus:
            count_method(corpus, set(svd_vowels), set(svd_consonants))