-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathabjad_alphabet_classifier.py
80 lines (67 loc) · 3.08 KB
/
abjad_alphabet_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import numpy as np
import math
def normalize(l):
dot = 0
for i in range(len(l)):
dot += l[i]**2
length = math.sqrt(dot)
return [0 if length == 0 else num/length for num in l]
def isNaN(num):
return num != num
def KL_based_method(weighted_A, letters, v, c):
total, delta = 0.0, 0.0
num_contexts = len(weighted_A[0])
for i in range(len(letters)):
if letters[i] not in v:
continue
let_i = np.asarray(normalize([delta if isNaN(num) else num+delta for num in weighted_A[i]]), dtype=np.float)
for j in range(len(letters)):
if sum(weighted_A[j]) == 0:
continue
let_j = np.asarray(normalize([delta if isNaN(num) else num+delta for num in weighted_A[j]]), dtype=np.float)
if letters[j] in v and i != j:
for index in range(num_contexts):
if let_i[index] != 0 and let_j[index] != 0:
total += abs(let_i[index] * np.log(let_i[index] / let_j[index]))
elif letters[j] in c and i != j:
for index in range(num_contexts):
if let_i[index] != 0 and let_j[index] != 0:
total -= abs(let_i[index] * np.log(let_i[index] / let_j[index]))
print("\nKL-based method of abjad and alphabet classification result: " + str(total) +
"\nNote that results for the KL-based method must be compared across corpora with approximately "
"the same number of words.")
def count_method(corpus, putative_vowels, putative_consonants):
'''Calculating percentage of words in the corpus without
putative vowels and percetage of words without putative
consonants'''
num_words = 0
num_words_without_putative_vowels = 0
num_words_without_putative_consonants = 0
count = 0
while 1:
input_line = corpus.readline()
if not input_line or count > 5000:
break
line = input_line.split(' ')
for word in line:
count += 1
if count > 5000:
break
num_words += 1
if word == '':
continue
word = set(list(word))
#print(word,putative_vowels,putative_consonants)
intersection_vowels = len(word.intersection(putative_vowels))
intersection_consonants = len(word.intersection(putative_consonants))
if intersection_vowels == 0 and intersection_consonants == 0:
num_words -= 1
continue
if intersection_vowels == 0:
num_words_without_putative_vowels += 1
if intersection_consonants == 0:
num_words_without_putative_consonants += 1
print("\nCount method: the percentage of words without putative vowels is " +
str(float(num_words_without_putative_vowels)/float(num_words)) +
" and the percentage of words without putative consonants is " +
str(float(num_words_without_putative_consonants) / float(num_words)))