import os
import numpy as np
from scipy.stats import spearmanr


# Write scores/classes as a tab-separated text file
def write_answer(path, targets, scores):
    with open(path, 'w', encoding='utf-8') as f_out:
        for target in targets:
            f_out.write("%s\t%s\n" % (target, scores[target]))
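
# For illustration (hypothetical words and values): write_answer("answer/task1/english.txt",
# ["word_a", "word_b"], {"word_a": 1, "word_b": 0}) produces the two lines
# "word_a\t1" and "word_b\t0".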


# Read a feature file: one "<word>\t<value>" line per target word
def read_feature_file(path):
    with open(path, encoding="utf-8") as fin:
        data = map(lambda s: s.strip().split("\t"), fin.readlines())
        words, x = zip(*data)
    return words, np.array(x, dtype=float)
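
# Note: every feature file for a language is expected to list the same words in
# the same order, since main() builds its word -> index mapping from the last
# file read and uses it to index all feature arrays.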


# Read ground truth values
def read_answers(path="test_data_public/"):
    languages = ["english", "german", "latin", "swedish"]
    ans_bin = dict()
    ans_rank = dict()
    for language in languages:
        # Read truth files
        with open(os.path.join(path, language, "truth", "binary.txt"), encoding="utf-8") as fin:
            ans_bin[language] = {w: int(y) for w, y in map(lambda s: s.strip().split("\t", 1), fin.readlines())}
        with open(os.path.join(path, language, "truth", "graded.txt"), encoding="utf-8") as fin:
            ans_rank[language] = {w: float(y) for w, y in map(lambda s: s.strip().split("\t", 1), fin.readlines())}
    return ans_bin, ans_rank
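
# The truth files use the same tab-separated layout: binary.txt holds one
# "<word>\t<0 or 1>" line per target word and graded.txt holds one
# "<word>\t<float score>" line per target word, e.g. "word_a\t1" and
# "word_a\t0.42" (hypothetical values).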


# Load the list of target words (one word per line)
def load_targets(path):
    with open(path, 'r', encoding='utf-8') as f_in:
        targets = [line.strip() for line in f_in]
    return targets


# Given a dict of predicted classes and a dict of truth classes,
# compute accuracy, precision, recall, F1, and fallout
def eval_binary(answer, truth):
    acc = 0
    # Count true/false positives and negatives; these counts are used
    # to compute precision, recall, F1, and fallout below
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    for word in answer:
        if answer[word] == truth[word]:
            acc += 1
            # Correct prediction: decide if it's a true positive or true negative
            if answer[word] == 1:
                tp += 1
            else:
                tn += 1
        else:  # wrong prediction: decide if it's a false positive or false negative
            if answer[word] == 1:
                fp += 1
            else:
                fn += 1
    acc = acc/len(answer)
    # The small epsilon avoids division by zero when a count is empty
    precision = tp/(tp+fp+1e-10)
    recall = tp/(tp+fn+1e-10)
    fallout = fp/(fp+tn+1e-10)
    f1 = 2*(precision*recall)/(precision+recall+1e-10)
    return acc, precision, recall, f1, fallout
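
# Worked example (hypothetical values): with answer = {"a": 1, "b": 1, "c": 0, "d": 0}
# and truth = {"a": 1, "b": 0, "c": 0, "d": 1}, we get tp=1, fp=1, tn=1, fn=1, so
# acc=0.5 and, up to the 1e-10 epsilon, precision=0.5, recall=0.5, f1=0.5, fallout=0.5.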


# Evaluate the ranking with Spearman's rho
def eval_ranking(answer, truth):
    words = sorted(list(answer.keys()))
    x_ans = [answer[w] for w in words]
    x_tru = [truth[w] for w in words]
    r, p = spearmanr(x_ans, x_tru)
    return r
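
# Rho is 1.0 when the predicted scores rank the words exactly as the gold graded
# scores do and -1.0 when the ranking is fully reversed; for example,
# spearmanr([1, 2, 3], [3, 2, 1]) gives rho = -1.0.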


# Soft-voting ensemble over per-word feature quantiles.
# p_features: dictionary of feature name -> array of feature quantiles
# targets: target words to be evaluated
# target_ids: index of each target word in the feature arrays
# use: features to use; if None, use all features
# threshold: average quantile above which a word is assigned class 1
def soft_voting(p_features, targets, target_ids, use=None, threshold=0.5):
    voting = dict()
    labels = dict()
    if not use:
        use = list(p_features.keys())
    for t, tid in zip(targets, target_ids):
        # Average the selected feature quantiles, then threshold for the binary label
        voting[t] = sum(p_features[f][tid] for f in use)/len(use)
        labels[t] = int(voting[t] > threshold)
    return voting, labels
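
# Illustrative example (hypothetical values): with
#   p_features = {"cosine": [0.9, 0.2], "freq_diff": [0.7, 0.4]}
#   targets = ["word_a", "word_b"], target_ids = [0, 1], threshold = 0.5
# soft_voting averages the feature quantiles per word, giving
#   voting = {"word_a": 0.8, "word_b": 0.3} and labels = {"word_a": 1, "word_b": 0}.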


# Estimate an empirical CDF for a feature distribution x.
# This is done by argsorting the values, assigning each element its rank
# (position from low to high), and normalizing the ranks by len(x).
def get_feature_cdf(x):
    y = np.argsort(x)
    p = np.zeros(len(x))
    for i, v in enumerate(y):
        p[v] = i
    p = p/len(x)
    return p
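
# Example: for x = np.array([0.3, 0.1, 0.9, 0.5]), np.argsort(x) is [1, 0, 3, 2],
# so get_feature_cdf(x) returns [0.25, 0.0, 0.75, 0.5]: each value is replaced by
# its normalized rank, i.e. an empirical quantile in [0, 1).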


# Load features from the features/ directory and produce the answer files
def main():
    languages = ["english", "german", "latin", "swedish"]
    # Read ground truth
    truth_bin, truth_rank = read_answers()
    # Model parameters for each language:
    # features to use in the final model
    lang_features = {
        "english": ["cosine"],
        "german": ["cosine", "freq_diff"],
        "latin": ["cosine", "freq_diff"],
        "swedish": ["cosine"]
    }
    # and the voting threshold for each language
    lang_t = {
        "english": 0.8,
        "german": 0.7,
        "latin": 0.2,
        "swedish": 0.75
    }

    overall_acc = 0
    overall_rank = 0
    for language in languages:
        # Get features and threshold for the current language
        use_features = lang_features[language]
        t = lang_t[language]
        # Load every feature file for this language
        feature_path = "features/%s/" % language
        features = dict()
        for root, dirs, files in os.walk(feature_path):
            for f in files:
                feature_name = f.split(".", 1)[0]
                words, x = read_feature_file(os.path.join(feature_path, f))
                features[feature_name] = x
        # Read target words and map each word to its row in the feature arrays
        word_id = {w: i for i, w in enumerate(words)}
        target_path = "test_data_public/%s/targets.txt" % language
        targets = load_targets(target_path)
        target_ids = [word_id[t] for t in targets]
        # Compute all feature CDFs
        p_features = {f: get_feature_cdf(features[f]) for f in features}
        # Get votes and aggregate them into a final class decision
        # based on threshold t
        voting, classes = soft_voting(p_features, targets, target_ids,
                                      use=use_features, threshold=t)
        # Output paths for the answer files; create the directories if needed
        out_task1 = "answer/task1/%s.txt" % language
        out_task2 = "answer/task2/%s.txt" % language
        if not os.path.exists("answer"):
            os.mkdir("answer")
        if not os.path.exists("answer/task1"):
            os.mkdir("answer/task1")
        if not os.path.exists("answer/task2"):
            os.mkdir("answer/task2")
        # Evaluate the binary predictions (accuracy, precision, recall, F1, fallout)
        acc_bin, prec_bin, rec_bin, f1_bin, fo_bin = eval_binary(classes, truth_bin[language])
        # Evaluate the ranking with Spearman's rho
        rank_r = eval_ranking(voting, truth_rank[language])
        overall_acc += acc_bin
        overall_rank += rank_r
        print("%s\t| t=%.2f\t| acc=%.3f\t| rank=%.3f"
              % (language, t, acc_bin, rank_r))
        # Write the answer files for both tasks
        write_answer(out_task1, targets, classes)
        write_answer(out_task2, targets, voting)
    # Report the average over the four languages
    print("-"*30)
    print("Overall\t| acc=%.3f\t| rank=%.3f" % (overall_acc/4, overall_rank/4))


if __name__ == "__main__":
    main()