-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathExtractParsedContext.py
281 lines (219 loc) · 11.6 KB
/
ExtractParsedContext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# Copyright (c) 2018
# Amy L. Olex, Virginia Commonwealth University
# alolex at vcu.edu
#
# Luke Maffey, Virginia Commonwealth University
# maffeyl at vcu.edu
#
# Nicholas Morton, Virginia Commonwealth University
# nmorton at vcu.edu
#
# Bridget T. McInnes, Virginia Commonwealth University
# btmcinnes at vcu.edu
#
# This file is part of Chrono
#
# Chrono is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# Chrono is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Chrono; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
## This program extracts the specified context around each SCATE Entity of a certain type.
## Example Usage: python ExtractParsedContext.py -x data/SemEval-OfficialTrain -i results/newswire.list -t data/SemEval-OfficialTrain -o results/output1-Gold-MinuteOfHour.txt -e Minute-Of-Hour -f gold -c 25
import argparse
from xml.dom import minidom
import os.path
def buildArgumentParser():
    """Create the command-line parser for this script.

    Returns:
        An argparse.ArgumentParser that understands the options -x, -F, -i,
        -t, -E, -o, -e, -f and -c. The context width (-c) is parsed as an
        integer so that its value has the same type as its default.
    """
    parser = argparse.ArgumentParser(description='Extract entity context information from text using the AnaforaXML Annotation files.')
    parser.add_argument('-x', metavar='annodir', type=str, help='path to the input directory that holds the annotated files.', required=True)
    parser.add_argument('-F', metavar='annotype', type=str, help='the format of the annotation files. Can be scate, i2b2, or ann. Default is scate.', required=False, default="scate")
    parser.add_argument('-i', metavar='filelist', type=str, help='File with list of documents to parse.', required=True)
    parser.add_argument('-t', metavar='textfiledir', type=str, help='Path to directory holding the raw text files.', required=True)
    parser.add_argument('-E', metavar='textfileext', type=str, help='Extension of the raw text files. Default is blank for no extension. All text files must have the same extension.', required=False, default="")
    parser.add_argument('-o', metavar='outputfile', type=str, help='Name of the output file to save results to.', required=True)
    parser.add_argument('-e', metavar='entity', type=str, help='The name of the entity we want to extract (for i2b2 this is - TLINK, TIMEX3, EVENT).', required=True)
    parser.add_argument('-f', metavar='flag', type=str, help='The type of files being parsed, either gold standard files or chrono annotated files (only matters for SCATE annotations).', required=False, default="chrono")
    # The context width is a character count, so let argparse validate and
    # convert it instead of carrying a string that happens to default to an int.
    parser.add_argument('-c', metavar='context', type=int, help='The number of characters before and after for context.', required=False, default=20)
    return parser


if __name__ == "__main__":
    ## Parse input arguments
    parser = buildArgumentParser()
    args = parser.parse_args()
    ## Now we can access each argument by its flag letter, e.g. args.i, args.o, args.c
def _scatePropertyText(properties, tag):
    """Return the text of the single <tag> element found under *properties*.

    Returns None when there is not exactly one such element, the string
    "None" when the element exists but has no child node, and the data of its
    first child otherwise.
    """
    matches = properties.getElementsByTagName(tag)
    if len(matches) != 1:
        return None
    child = matches[0].firstChild
    return "None" if child is None else child.data


def getTargetSpansXML(xmlfile, entity, filetype):
    """Collect the annotations of one entity type from an XML annotation file.

    Args:
        xmlfile: The annotation file, passed straight to minidom.parse.
        entity: For "i2b2" files, the tag name of the elements to collect.
            For "scate" files, the text an <entity>'s <type> child must equal.
        filetype: "i2b2" or "scate" (compared case-insensitively).

    Returns:
        A list with one [id, type, start, end, extra, value] list per match,
        where start and end are ints.
        For i2b2, the fields come from the element's id, type, start, end,
        mod and val attributes.
        For scate, start/end come from the "start,end" text of <span>;
        extra is the <Number> property and value is the <Value> property,
        falling back to the <Type> property. A property that is absent is
        reported as "" and one that is present but empty as "None".

    Raises:
        ValueError: If filetype is neither "i2b2" nor "scate".
    """
    kind = filetype.lower()
    if kind not in ("i2b2", "scate"):
        # Previously this fell through and failed on an unbound local.
        raise ValueError("Unsupported annotation file type: " + str(filetype))

    xmldoc = minidom.parse(xmlfile)
    entitylist = []

    if kind == "i2b2":
        for item in xmldoc.getElementsByTagName(entity):
            entitylist.append([
                item.getAttribute('id'),
                item.getAttribute('type'),
                int(item.getAttribute('start')),
                int(item.getAttribute('end')),
                item.getAttribute('mod'),
                item.getAttribute('val'),
            ])
        return entitylist

    for item in xmldoc.getElementsByTagName('entity'):
        etype = item.getElementsByTagName('type')[0].firstChild.data
        if etype != entity:
            # Only entities of the requested type are reported, so there is no
            # need to read the remaining fields of the others.
            continue

        eid = item.getElementsByTagName('id')[0].firstChild.data
        espan = item.getElementsByTagName('span')[0].firstChild.data

        enumber = None
        evalue = None
        eproperties = item.getElementsByTagName('properties')
        if eproperties:
            enumber = _scatePropertyText(eproperties[0], 'Number')
            evalue = _scatePropertyText(eproperties[0], 'Value')
            if evalue is None:
                evalue = _scatePropertyText(eproperties[0], 'Type')
        if enumber is None:
            enumber = ""
        if evalue is None:
            evalue = ""

        start, end = espan.split(",")
        entitylist.append([eid, etype, int(start), int(end), enumber, evalue])

    return entitylist
def writeTargetSpansXML(infile, entitylist, context, outfile):
    """Report each entity together with its surrounding text.

    Args:
        infile: Path of the raw text file the entity offsets refer to.
        entitylist: Entries shaped like [id, type, start, end, extra, value],
            as produced by getTargetSpansXML.
        context: Number of characters to include on each side of the entity.
            When greater than zero, a description line and the context window
            are written to outfile. Otherwise nothing is written and the
            lower-cased entity text is collected instead.
        outfile: Writable text stream that receives the report.

    Returns:
        The set of lower-cased entity strings gathered when context is not
        positive; an empty set when context is positive.
    """
    # Read the document once and close the handle straight away.
    with open(infile, 'r') as textfile:
        linestring = textfile.read()

    term_set = set()
    for entity in entitylist:
        span_start = int(entity[2])
        span_end = int(entity[3])
        # Clamp the window so it never runs off either end of the document.
        start = max(0, span_start - context)
        end = min(len(linestring), span_end + context)
        if context > 0:
            outfile.write(
                "\n\nID: {}, Type: {}, Span: ({},{}), Raw Token: {}, Value: {}, Number/i2b2Modifier: {}".format(
                    entity[0], entity[1], entity[2], entity[3],
                    linestring[span_start:span_end], entity[5], entity[4]))
            outfile.write("\n" + linestring[start:end])
        else:
            term_set.add(linestring[start:end].lower())
    return term_set
def getTargetSpansANN(annfile, entity):
    """Collect the annotations of one type from a tab-separated .ann file.

    Each usable line has the form "ID<TAB>TYPE START END[<TAB>TEXT]". The
    middle field may also hold four space-separated parts, in which case the
    second part is taken as the start and the fourth as the end.

    Blank lines are ignored. Lines with any other layout are reported on
    stdout and skipped, so the result is always a list.

    Args:
        annfile: Path of the annotation file.
        entity: The annotation type to keep.

    Returns:
        A list of [id, type, start, end, text] lists with int start and end;
        text is "" when the line carries no third field.
    """
    # (number of tab fields, number of space-separated parts in the middle field)
    recognized_layouts = ((2, 3), (3, 3), (3, 4))

    entitylist = []
    with open(annfile) as file:
        for rawline in file:
            line = rawline.strip()
            if not line:
                continue

            fields1 = line.split('\t')
            fields2 = fields1[1].split(' ') if len(fields1) > 1 else []
            if (len(fields1), len(fields2)) not in recognized_layouts:
                # Skip just this line rather than abandoning the whole file.
                print("Error, unrecognized number of fields: " + line)
                continue

            etype = fields2[0]
            if etype != entity:
                continue

            etoken = fields1[2] if len(fields1) == 3 else ""
            # The end offset is always the last part of the middle field.
            entitylist.append([fields1[0], etype, int(fields2[1]), int(fields2[-1]), etoken])
    return entitylist
def writeTargetSpansANN(infile, entitylist, context, outfile):
    """Report each .ann entity together with its surrounding text.

    Args:
        infile: Path of the raw text file the entity offsets refer to.
        entitylist: Entries shaped like [id, type, start, end, text], as
            produced by getTargetSpansANN.
        context: Number of characters to include on each side of the entity.
            When greater than zero, a description line and the context window
            are written to outfile. Otherwise nothing is written and the
            lower-cased entity text is collected instead.
        outfile: Writable text stream that receives the report.

    Returns:
        The set of lower-cased entity strings gathered when context is not
        positive; an empty set when context is positive.
    """
    # Read the document once and close the handle straight away.
    with open(infile, 'r') as textfile:
        linestring = textfile.read()

    term_set = set()
    for entity in entitylist:
        span_start = int(entity[2])
        span_end = int(entity[3])
        # Clamp the window so it never runs off either end of the document.
        start = max(0, span_start - context)
        end = min(len(linestring), span_end + context)
        if context > 0:
            outfile.write(
                "\n\nID: {}, Type: {}, Span: ({},{}), Raw Token: {}, Listed Token: {}".format(
                    entity[0], entity[1], entity[2], entity[3],
                    linestring[span_start:span_end], entity[4]))
            outfile.write("\n" + linestring[start:end])
        else:
            term_set.add(linestring[start:end].lower())
    return term_set
############### Start Main Method ######################
def _resolveDocumentPaths(annodir, textdir, docname, annotype, flag, textext):
    """Return (annotation path, raw text path) for one document.

    annotype must already be lower case. For "scate" both files live in a
    sub-directory named after the document, and the annotation file name
    depends on flag: "gold" selects the ".TimeNorm.gold.completed.xml" file,
    anything else the ".completed.xml" file. For "i2b2" the annotation file is
    the document name itself. Any other annotype is treated as "ann", whose
    annotation file is the document name plus ".ann".
    """
    if annotype == "scate":
        suffix = ".TimeNorm.gold.completed.xml" if flag == "gold" else ".completed.xml"
        annopath = os.path.join(annodir, docname, docname + suffix)
        textpath = os.path.join(textdir, docname, docname + textext)
    elif annotype == "i2b2":
        annopath = os.path.join(annodir, docname)
        textpath = os.path.join(textdir, docname + textext)
    else:
        annopath = os.path.join(annodir, docname + ".ann")
        textpath = os.path.join(textdir, docname + textext)
    return annopath, textpath


if __name__ == "__main__":
    ## Normalise the format name once so "scate" and "SCATE" behave the same
    annotype = args.F.lower()
    if annotype not in ("scate", "i2b2", "ann"):
        raise SystemExit("Unknown annotation format '" + args.F + "'. Expected scate, i2b2, or ann.")
    context = int(args.c)

    ## Read the list of documents, ignoring blank lines such as a trailing newline
    with open(args.i, 'r') as listfile:
        inputfiles = [name for name in listfile.read().split("\n") if name]

    terms = set()
    with open(args.o, 'w') as out:
        ## Loop over each file in the file list and parse it
        for f in inputfiles:
            path, path2 = _resolveDocumentPaths(args.x, args.t, f, annotype, args.f, args.E)

            ## A document is processed only when both its annotation file and
            ## its raw text file exist; otherwise it is reported as skipped.
            if os.path.isfile(path) and os.path.isfile(path2):
                if annotype == "ann":
                    myElist = getTargetSpansANN(path, args.e)
                    writeSpans = writeTargetSpansANN
                else:
                    myElist = getTargetSpansXML(path, args.e, annotype)
                    writeSpans = writeTargetSpansXML

                ## Pass this information to extract the text segments and write to file
                if context > 0:
                    out.write("\n\n*****\nFile: " + f)
                terms = terms.union(writeSpans(path2, myElist, context, out))
            elif context > 0:
                out.write("\n\n*****\nSkipping File: " + f)

        ## With a context of zero the output is the sorted list of unique terms
        if context == 0:
            for t in sorted(terms):
                out.write("\n" + t)

    print("Completed!")