ambigqa_evaluate_script.py
import os
import re
import json
import string
import argparse
import numpy as np
#from collections import Counter, defaultdict
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
tokenizer = PTBTokenizer()
class QAPairEvaluation(object):
    def __init__(self, reference, prediction, metrics="all"):
        '''
        :param reference: a list of annotated reference examples
        :param prediction: a dictionary with id as key and prediction as value;
                           each prediction can be either
                           - a list of strings
                           - a list of dictionaries with question and answer as keys
        '''
        self.reference = reference
        self.prediction = [prediction[sample["id"]] for sample in reference]
        self.metrics = metrics
        METRICS_ANSWER = ["F1 answer"]
        METRICS_QG = ["F1 bleu1", "F1 bleu2", "F1 bleu3", "F1 bleu4", "F1 edit-f1"]
        if metrics=="all" and type(self.prediction[0][0])==str:
            self.metrics = METRICS_ANSWER
        elif metrics=="all":
            self.metrics = METRICS_ANSWER+METRICS_QG
        assert len(set(self.metrics)-set(METRICS_ANSWER)-set(METRICS_QG))==0
        self.QG_METRICS_TO_COMPUTE = [m for m in ["bleu1", "bleu2", "bleu3", "bleu4", "rouge-l", "edit-f1"] if any([metric.endswith(m) for metric in self.metrics])]
        if len(self.QG_METRICS_TO_COMPUTE)>0:
            # if evaluating QG, tokenize prompt question,
            # reference question and predicted question
            data_to_tokenize = {}
            for i, ref in enumerate(self.reference):
                data_to_tokenize["prompt.{}".format(i)] = [{"caption": ref["question"]}]
                for j, annotation in enumerate(ref["annotations"]):
                    if annotation['type']=='multipleQAs':
                        for k, pair in enumerate(annotation['qaPairs']):
                            data_to_tokenize["ref.{}.{}.{}".format(i, j, k)] = \
                                [{'caption': sent.strip()} for sent in pair["question"].split('|') if len(sent.strip())>0]
            for i, pred in enumerate(self.prediction):
                for j, pair in enumerate(pred):
                    data_to_tokenize["gen.{}.{}".format(i, j)] = [{"caption": pair["question"]}]
            all_tokens = tokenizer.tokenize(data_to_tokenize)
            for key, values in all_tokens.items():
                values = {'sent': [normalize_answer(value) for value in values]}
                if key.startswith("prompt."):
                    i = key.split(".")[1]
                    self.reference[int(i)]["question"] = values
                elif key.startswith("ref."):
                    i, j, k = key.split('.')[1:]
                    self.reference[int(i)]["annotations"][int(j)]["qaPairs"][int(k)]["question"] = values
                elif key.startswith("gen."):
                    i, j = key.split(".")[1:]
                    self.prediction[int(i)][int(j)]["question"] = values
                else:
                    raise NotImplementedError()
        self.is_multi = [not any([ann["type"]=="singleAnswer" for ann in ref["annotations"]]) \
                         for ref in self.reference]
        self.results = [self.get_all_metrics(idx) for idx in range(len(self.reference))]

    def print_all_metrics(self):
        for metric in self.metrics:
            result = [e[metric] for e in self.results]
            result_multi_only = [e[metric] for e, is_multi in zip(self.results, self.is_multi) \
                                 if is_multi]
            if metric=="F1 answer":
                print ("%s\t%.3f (all)\t%.3f (multi only)" % (metric, np.mean(result), np.mean(result_multi_only)))
            else:
                print ("%s\t%.3f" % (metric, np.mean(result_multi_only)))

    def get_metric(self, metric):
        return np.mean([e[metric] for e in self.results])

    def get_all_metrics(self, idx):
        evaluation = {}
        promptQuestion = self.reference[idx]["question"]
        annotations = self.reference[idx]["annotations"]
        if type(self.prediction[idx][0])==dict:
            # prediction contains a set of question-answer pairs
            predictions = [pair["answer"] for pair in self.prediction[idx]]
            questions = [pair["question"] for pair in self.prediction[idx]]
        else:
            # prediction contains a set of answers
            predictions = self.prediction[idx]
            questions = None
        for annotation in annotations:
            # iterate over each annotation and take the maximum metrics
            if annotation['type']=='singleAnswer':
                f1 = get_f1([annotation['answer']], predictions)
                for metric in self.metrics:
                    if metric.startswith('F1'):
                        evaluation[metric] = max(evaluation.get(metric, 0), f1)
            elif annotation['type']=='multipleQAs':
                matching_pairs = []
                evaluation['F1 answer'] = max(evaluation.get("F1 answer", 0),
                                              get_f1([answer['answer'] for answer in annotation['qaPairs']], predictions))
                if questions is None:
                    # skip the below if not evaluating QG
                    continue
                for i, answer in enumerate(annotation["qaPairs"]):
                    for j, prediction in enumerate(predictions):
                        # get every reference-prediction pair with the correct answer prediction
                        em = get_exact_match(answer['answer'], prediction)
                        if em:
                            qg_evals = get_qg_metrics(questions[j],
                                                      answer['question'],
                                                      promptQuestion,
                                                      self.QG_METRICS_TO_COMPUTE)
                            matching_pairs.append((i, j, qg_evals))

                def _get_qg_f1(metric_func):
                    curr_matching_pairs = sorted(matching_pairs, key=lambda x: metric_func(x[2]), reverse=True)
                    occupied_answers = [False for _ in annotation["qaPairs"]]
                    occupied_predictions = [False for _ in predictions]
                    tot = 0
                    # find non-overlapping reference-prediction pairs
                    # that match the answer prediction
                    # to get the evaluation score
                    for (i, j, e) in curr_matching_pairs:
                        if occupied_answers[i] or occupied_predictions[j]:
                            continue
                        occupied_answers[i] = True
                        occupied_predictions[j] = True
                        tot += metric_func(e)
                    assert np.sum(occupied_answers)==np.sum(occupied_predictions)
                    return 2 * tot / (len(occupied_answers)+len(occupied_predictions))

                for metric in self.QG_METRICS_TO_COMPUTE:
                    metric_name = "F1 {}".format(metric)
                    if metric_name in self.metrics:
                        e = _get_qg_f1(lambda x: x[metric])
                        evaluation[metric_name] = max(evaluation.get(metric_name, 0), e)
            else:
                raise NotImplementedError()
        assert len(self.metrics)==len(evaluation), (self.metrics, evaluation.keys())
        return evaluation
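
# Illustrative only (not part of the original script): typical programmatic use of
# QAPairEvaluation, assuming `reference` and `prediction` were loaded with
# load_reference()/load_prediction() defined below.
#
#   evaluation = QAPairEvaluation(reference, prediction)
#   evaluation.print_all_metrics()
#   answer_f1 = evaluation.get_metric("F1 answer")
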
def get_qg_metrics(generated, question, promptQuestion, metrics):
    evaluation = {}
    # computing bleu scores
    for name, score in zip(['bleu{}'.format(i) for i in range(1, 5)],
                           Bleu(4).compute_score(question, generated)[0]):
        if name in metrics:
            evaluation[name] = score
    # computing edit-f1 score
    if 'edit-f1' in metrics:
        def _get_edits(tokens1, tokens2):
            allCommon = []
            while True:
                commons = list(set(tokens1) & set(tokens2))
                if len(commons)==0:
                    break
                allCommon += commons
                for c in commons:
                    ind1, ind2 = tokens1.index(c), tokens2.index(c)
                    tokens1 = tokens1[:ind1]+tokens1[ind1+1:]
                    tokens2 = tokens2[:ind2]+tokens2[ind2+1:]
            deleted = ["[DELETED]"+token for token in tokens1]
            added = ["[ADDED]"+token for token in tokens2]
            common = ["[FIXED]"+token for token in allCommon]
            return deleted+added #+common

        assert len(generated)==len(promptQuestion)==1
        generated = generated["sent"][0].split(" ")
        promptQuestion = promptQuestion["sent"][0].split(" ")
        prediction = _get_edits(promptQuestion, generated)
        edit_f1 = 0
        for _question in question["sent"]:
            _question = _question.split(" ")
            reference = _get_edits(promptQuestion, _question)
            # now compare the reference edits and predicted edits
            if len(reference)==len(prediction)==0:
                # rarely, the reference has no edits after normalization;
                # then, if the prediction also has no edits, it gets full score
                edit_f1 = 1
            elif len(reference)==0 or len(prediction)==0:
                # if only one of them has no edits, zero score
                edit_f1 = max(edit_f1, 0)
            else:
                # otherwise, compute F1 score between prediction and reference
                edit_f1 = max(edit_f1, get_f1(prediction, reference, is_equal=lambda x, y: x==y))
        evaluation["edit-f1"] = edit_f1
    assert len(metrics)==len(evaluation)
    return evaluation
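
# A small worked example (illustrative, not from the source): given prompt tokens
# ["when", "did", "x", "happen"] and a generated question
# ["when", "did", "x", "first", "happen"], _get_edits strips the shared tokens and
# returns ["[ADDED]first"]; edit-f1 is then the F1 overlap between these predicted
# edits and the edits of each reference disambiguated question.
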
def get_exact_match(answers1, answers2):
    if type(answers1)==list:
        if len(answers1)==0:
            return 0
        return np.max([get_exact_match(a, answers2) for a in answers1])
    if type(answers2)==list:
        if len(answers2)==0:
            return 0
        return np.max([get_exact_match(answers1, a) for a in answers2])
    return (normalize_answer(answers1) == normalize_answer(answers2))
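
# Examples (illustrative): get_exact_match("The Beatles!", ["beatles", "queen"])
# evaluates to True because both sides normalize to "beatles", while
# get_exact_match("The Beatles", "Queen") evaluates to False.
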
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))
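
# Example (illustrative): normalize_answer("The Eiffel Tower!") returns
# "eiffel tower" after lowercasing, stripping punctuation, dropping articles,
# and collapsing whitespace.
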
def get_f1(answers, predictions, is_equal=get_exact_match):
    '''
    :answers: a list of lists of strings
    :predictions: a list of strings
    '''
    assert len(answers)>0 and len(predictions)>0, (answers, predictions)
    occupied_answers = [False for _ in answers]
    occupied_predictions = [False for _ in predictions]
    for i, answer in enumerate(answers):
        for j, prediction in enumerate(predictions):
            if occupied_answers[i] or occupied_predictions[j]:
                continue
            em = is_equal(answer, prediction)
            if em:
                occupied_answers[i] = True
                occupied_predictions[j] = True
    assert np.sum(occupied_answers)==np.sum(occupied_predictions)
    a, b = np.mean(occupied_answers), np.mean(occupied_predictions)
    if a+b==0:
        return 0
    return 2*a*b/(a+b)
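
# A worked example (illustrative): with answers ["a", "b", "c"] and predictions
# ["a", "d"], only the pair ("a", "a") matches, so answer-side recall is 1/3,
# prediction-side precision is 1/2, and get_f1 returns 2*(1/3)*(1/2)/(1/3+1/2) = 0.4.
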
def load_reference(reference_path):
    if os.path.exists(reference_path):
        with open(reference_path, "r") as f:
            reference = json.load(f)
        if not (type(reference)==list and \
                all([type(ref)==dict and "id" in ref and "question" in ref and "annotations" in ref and \
                     type(ref["question"])==str and type(ref["annotations"])==list and \
                     all([type(ann)==dict and ann["type"] in ["singleAnswer", "multipleQAs"] for ann in ref["annotations"]]) \
                     for ref in reference])):
            raise Exception("Reference file {} is wrong".format(reference_path))
    else:
        raise Exception("Reference file {} not found".format(reference_path))
    return reference
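
# Sketch of the reference format this loader accepts (field values are illustrative):
#   [{"id": "<example id>", "question": "<prompt question>",
#     "annotations": [{"type": "singleAnswer", "answer": ["<answer>", ...]},
#                     {"type": "multipleQAs",
#                      "qaPairs": [{"question": "<disambiguated question>",
#                                   "answer": ["<answer>", ...]}, ...]}]}]
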
def load_prediction(prediction_path, ids):
    if os.path.exists(prediction_path):
        with open(prediction_path, "r") as f:
            prediction = json.load(f)
        if type(list(prediction.keys())[0])==int:
            prediction = {str(key):value for key, value in prediction.items()}
        if type(list(prediction.values())[0])==str:
            prediction = {key:[value] for key, value in prediction.items()}
        if not (type(prediction)==dict and \
                len(ids-set(prediction.keys()))==0):
            raise Exception("Prediction file {} is wrong".format(prediction_path))
        if not (all([type(pred)==list for pred in prediction.values()]) and \
                (all([type(p)==str for pred in prediction.values() for p in pred]) or \
                 all([type(p)==dict and "question" in p and "answer" in p \
                      and type(p["question"])==type(p["answer"])==str for pred in prediction.values() for p in pred]))):
            raise Exception("Prediction file {} has a wrong format".format(prediction_path))
    else:
        raise Exception("Prediction file {} not found".format(prediction_path))
    return prediction
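
# Sketch of the prediction formats this loader accepts (ids/values are illustrative):
#   {"<id>": ["answer 1", "answer 2"]}                          # answers only
#   {"<id>": [{"question": "...", "answer": "..."}, ...]}       # question-answer pairs
# A single answer string per id is also accepted and is wrapped into a list.
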
if __name__=="__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--reference_path', type=str, required=True)
    parser.add_argument('--prediction_path', type=str, required=True)
    args = parser.parse_args()

    reference = load_reference(args.reference_path)
    ids = set([d["id"] for d in reference])
    prediction = load_prediction(args.prediction_path, ids)
    evaluation = QAPairEvaluation(reference, prediction)
    evaluation.print_all_metrics()
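
# Example invocation (illustrative; the file names below are placeholders):
#   python ambigqa_evaluate_script.py \
#       --reference_path dev_reference.json \
#       --prediction_path dev_prediction.json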