This repository has been archived by the owner on Jan 8, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcorpus.py
executable file
·280 lines (211 loc) · 8.64 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#!/usr/bin/env python3
"""
Module to manage the VUAM Corpus
"""
from collections import OrderedDict
from csv import reader, DictReader
from itertools import chain
class VUAMC():
    """
    Represents the VUAMC metaphor corpus.

    Takes the VUAMC sentence CSV, a POS-tag CSV (same layout) and a
    token CSV (train gold labels or test token ids) and exposes them as
    nested OrderedDicts plus flat token/label/POS lists.

    Example input files:

    $ head vuamc_corpus_train.csv
    "txt_id","sentence_id","sentence_txt"
    "a1e-fragment01","1","Latest corporate unbundler M_reveals laid-back M_approach : Roland Franklin , who is M_leading a 697m pound break-up bid for DRG , talks M_to Frank Kane"

    $ head all_pos_tokens_train_gold_labels.csv
    a1h-fragment06_116_1,0
    a1h-fragment06_116_2,0
    a1h-fragment06_116_5,1
    """

    def __init__(self, vuamc_file, tokens_file, tags_file, mode='train'):
        """
        :param string vuamc_file: Test/Train VUAMC file as csv
        :param string tokens_file: Test/Train Tokens file as csv
        :param string tags_file: POS-tags file as csv (same layout as vuamc_file)
        :param string mode: "train" attaches gold labels to the tokens;
                            any other value stores -1 placeholders
        :return: VUAMC Object
        """
        self.delimiter = ','
        self.quotechar = '"'
        self.mode = mode
        # Lazily-populated caches backing the corresponding properties.
        self._sentences = None
        self._token_list = None
        self._label_list = None
        self._pos_list = None
        self.vuamc_file = vuamc_file
        self.tokens_file = tokens_file
        self.tags_file = tags_file
        # Order matters: _load_vuamc copies per-sentence tags out of self.tags,
        # and _load_tokens (in train mode) reads labels out of self.vuamc.
        self.tags = self._load_tags(self.tags_file)
        self.vuamc = self._load_vuamc(self.vuamc_file)
        self.tokens = self._load_tokens(self.tokens_file)

    def _load_tags(self, filename):
        """
        Load the POS-tag CSV file.

        :param string filename: CSV with columns txt_id, sentence_id,
                                sentence_txt (space-separated tags)
        :return: OrderedDict data[txt_id][sentence_id]['tags'] -> list of tags
                 (0-based list, one tag per whitespace-separated column)

        Calls exit() on a duplicate (txt_id, sentence_id) pair.
        """
        data = OrderedDict()
        with open(filename) as csvfile:
            csvreader = DictReader(csvfile, delimiter=self.delimiter, quotechar=self.quotechar)
            for row in csvreader:
                txt_id = row['txt_id']
                sentence_id = row['sentence_id']
                tags = row['sentence_txt'].split(' ')
                if txt_id not in data:
                    data[txt_id] = {}
                # Duplicate sentence ids indicate a corrupt input file.
                if sentence_id in data[txt_id]:
                    exit('Identical keys in line {}'.format(csvreader.line_num))
                data[txt_id][sentence_id] = {'tags': tags}
        return data

    def _load_vuamc(self, filename):
        """
        Load the VUAMC CSV file into an OrderedDict.

        The final structure is:
            self.vuamc['a1h-fragment06']['134']['tokens'][23]
        With the corresponding metaphor (0|1) labels:
            self.vuamc['a1h-fragment06']['134']['labels'][23]

        Token/label keys are 1-based; an 'M_' prefix on a token marks it
        as a metaphor (prefix is stripped, label set to 1).

        Calls exit() on a duplicate (txt_id, sentence_id) pair; raises
        KeyError if a sentence has no entry in the tags file.
        """
        data = OrderedDict()
        with open(filename) as csvfile:
            csvreader = DictReader(csvfile, delimiter=self.delimiter, quotechar=self.quotechar)
            for row in csvreader:
                txt_id = row['txt_id']
                sentence_id = row['sentence_id']
                sentence_txt = row['sentence_txt']
                if txt_id not in data:
                    data[txt_id] = OrderedDict()
                if sentence_id in data[txt_id]:
                    exit('Identical keys in line {}'.format(csvreader.line_num))
                data[txt_id][sentence_id] = OrderedDict()
                tokens = OrderedDict()
                labels = OrderedDict()
                for token_id, token in enumerate(sentence_txt.strip().split(' ')):
                    if token.startswith('M_'):
                        # Metaphor marker: strip prefix, remember the label.
                        token = token[2:]
                        labels[token_id + 1] = 1
                    else:
                        labels[token_id + 1] = 0
                    tokens[token_id + 1] = token
                data[txt_id][sentence_id]['tokens'] = tokens
                data[txt_id][sentence_id]['labels'] = labels
                data[txt_id][sentence_id]['tags'] = self.tags[txt_id][sentence_id]['tags']
        return data

    def _load_tokens(self, filename):
        """
        Load the token ids (and, in train mode, their gold labels) into
        an OrderedDict.  These are used to yield the (token, label)
        tuples for the sentences.

        The final structure is:
            self.tokens['a1h-fragment06']['134'][23]

        In train mode the label is taken from the already-parsed corpus
        (the M_ markers), not from column 2 of this file — the two are
        cross-checked later by validate_corpus().  In any other mode the
        label is the placeholder -1.

        Calls exit() on a duplicate (txt_id, sentence_id, token_id) key.
        """
        data = OrderedDict()
        with open(filename) as csvfile:
            csvreader = reader(csvfile, delimiter=self.delimiter, quotechar=self.quotechar)
            for row in csvreader:
                # Row key looks like 'a1h-fragment06_116_1' (txt ids use
                # hyphens, so splitting on '_' is safe).
                txt_id, sentence_id, token_id = row[0].split('_')
                if self.mode == 'train':
                    label = self.vuamc[txt_id][sentence_id]['labels'][int(token_id)]
                else:
                    label = -1
                if txt_id not in data:
                    data[txt_id] = OrderedDict()
                if sentence_id not in data[txt_id]:
                    data[txt_id][sentence_id] = OrderedDict()
                if int(token_id) in data[txt_id][sentence_id]:
                    exit('Identical keys in line {}'.format(csvreader.line_num))
                data[txt_id][sentence_id][int(token_id)] = label
        return data

    def validate_corpus(self):
        """
        Check that the 'txt_id, sentence_id, token_id, class_label'-s
        from the csv files match.

        Raises AssertionError if the files don't match.  (NOTE: asserts
        are stripped under ``python -O``.)
        """
        for txt_id in self.tokens:
            for sentence_id in self.tokens[txt_id]:
                for token_id in self.tokens[txt_id][sentence_id]:
                    if self.mode == 'train':
                        # Label attached to the token must equal the one
                        # parsed from the M_ markers in the corpus.
                        assert (self.tokens[txt_id][sentence_id][token_id] ==
                                self.vuamc[txt_id][sentence_id]['labels'][token_id])
                    else:
                        # Test corpora carry no M_ markers at all.
                        assert self.vuamc[txt_id][sentence_id]['labels'][token_id] == 0

    def sentence(self, text_id, sentence_id):
        """
        Return a sentence as a list of (token, label, tag) tuples, with
        the label taken from self.tokens where present and 0 otherwise.
        """
        sentence = []
        for token_id in self.vuamc[text_id][sentence_id]['tokens'].keys():
            if token_id in self.tokens[text_id][sentence_id]:
                # Token appears in the tokens file: use its stored label.
                label = self.tokens[text_id][sentence_id][token_id]
            else:
                # Token not listed: defaults to non-metaphor.
                label = 0
            try:
                # NOTE(review): 'tags' is a 0-based list while token ids
                # are 1-based, so tags[token_id] reads the tag one
                # position ahead and the LAST token always falls back to
                # 'X' — confirm this offset is intended given the tags
                # file format.
                tag = self.vuamc[text_id][sentence_id]['tags'][token_id]
            except IndexError:
                tag = 'X'
            sentence.append((self.vuamc[text_id][sentence_id]['tokens'][token_id], label, tag))
        return sentence

    @property
    def sentences(self):
        """
        List of all sentences, each a list of (token, label, tag)
        tuples, in tokens-file order.  Computed once and cached.
        """
        def populate_sentences():
            """Helper to yield every sentence referenced by self.tokens."""
            for text_id in self.tokens:
                for sentence_id in self.tokens[text_id]:
                    yield self.sentence(text_id, sentence_id)

        if self._sentences is None:
            self._sentences = list(populate_sentences())
        return self._sentences

    @property
    def token_list(self):
        """Flat list of all tokens across all sentences (cached)."""
        def populate_tokens():
            """Turn the sentences list into per-sentence token lists."""
            for sentence in self.sentences:
                yield [item[0] for item in sentence]

        if self._token_list is None:
            # Flatten list of lists
            self._token_list = list(chain(*list(populate_tokens())))
        return self._token_list

    @property
    def label_list(self):
        """Flat list of all labels across all sentences (cached)."""
        def populate_labels():
            """Turn the sentences list into per-sentence label lists."""
            for sentence in self.sentences:
                yield [item[1] for item in sentence]

        if self._label_list is None:
            # Flatten list of lists
            self._label_list = list(chain(*list(populate_labels())))
        return self._label_list

    @property
    def pos_list(self):
        """Flat list of all POS tags across all sentences (cached)."""
        def populate_pos():
            """Turn the sentences list into per-sentence tag lists."""
            for sentence in self.sentences:
                yield [item[2] for item in sentence]

        if self._pos_list is None:
            # Flatten list of lists
            self._pos_list = list(chain(*list(populate_pos())))
        return self._pos_list