-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathxml2ann.py
168 lines (129 loc) · 6.58 KB
/
xml2ann.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Copyright (c) 2018
# Amy L. Olex, Virginia Commonwealth University
# alolex at vcu.edu
#
# Luke Maffey, Virginia Commonwealth University
# maffeyl at vcu.edu
#
# Nicholas Morton, Virginia Commonwealth University
# nmorton at vcu.edu
#
# Bridget T. McInnes, Virginia Commonwealth University
# btmcinnes at vcu.edu
#
# This file is part of Chrono
#
# Chrono is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# Chrono is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Chrono; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
## This program extracts the specified context around each SCATE Entity of a certain type.
## Example Usage: python xml2ann.py -x data/SemEval-OfficialTrain -i results/newswire.list -t data/SemEval-OfficialTrain -o results/entities.ann -f gold
import argparse
from xml.dom import minidom
import os.path
if __name__ == "__main__":
## Parse input arguments
parser = argparse.ArgumentParser(description='Extract entity information from AnaforaXML Annotation files and the raw text, and convert to ANN formatted files. Note this method only converts the entities and currently does not handle relation extraction.')
parser.add_argument('-x', metavar='xmldir', type=str, help='path to the input directory that holds the annotated XML files (must be named using file ID).', required=True)
parser.add_argument('-i', metavar='filelist', type=str, help='List of document IDs to parse.', required=True)
parser.add_argument('-t', metavar='textfiledir', type=str, help='Path to directory holding the raw text files (must be named using file id).', required=True)
parser.add_argument('-E', metavar='textfileext', type=str, help='Extension of the raw text files. Default is blank for no extension. All text files must have the same extension.', required=False, default="")
parser.add_argument('-o', metavar='outdir', type=str, help='Name of the output file directory to save results to. If blank, will save .ann files to same directory as input files.', required=False, default="")
parser.add_argument('-f', metavar='flag', type=str, help='Must be one of (gold, chrono) to identify correct xml file extension.', required=True)
args = parser.parse_args()
## Now we can access each argument as args.i, args.o, args.r
def getTargetSpansXML(xmlfile):
xmldoc = minidom.parse(xmlfile)
itemlist = xmldoc.getElementsByTagName('entity')
entitylist = []
for item in itemlist:
eid = item.getElementsByTagName('id')[0].firstChild.data
espan = item.getElementsByTagName('span')[0].firstChild.data
etype = item.getElementsByTagName('type')[0].firstChild.data
eproperties = item.getElementsByTagName('properties')
if(eproperties and (len(eproperties[0].getElementsByTagName('Number')) == 1)):
tmp = eproperties[0].getElementsByTagName('Number')[0].firstChild
if tmp is not None:
enumber = eproperties[0].getElementsByTagName('Number')[0].firstChild.data
else:
enumber = "None"
else:
enumber = ""
espan_list = espan.split(",")
start = espan_list[0]
end = espan_list[len(espan_list)-1]
entitylist.append([eid, etype, int(start), int(end), enumber])
return(entitylist)
def writeTargetSpansXML(infile, entitylist, outfile):
linestring = open(infile, 'r').read()
term_set = set()
count = 1
for entity in entitylist:
start = int(entity[2])
end = int(entity[3])
outfile.write("\nT" + str(count) + "\t" + entity[1] + " " + str(entity[2]) + " " + str(entity[3]) + "\t" + linestring[entity[2]:entity[3]])
count = count+1
############### Start Main Method ######################
## Loop over each file in the file list and parse it
#out = open(args.o, 'w')
inputfiles = open(args.i, 'r').read().split("\n")
terms = set()
for f in inputfiles:
if args.o == "":
outfile = args.x + "/" + f + "/" + f + ".ann"
if(os.path.isdir(args.x + "/" + f)):
out = open(outfile, 'w')
else:
print("Directory not found, skipping: " + args.x + "/" + f)
continue
else:
outfile = args.o + "_" + f + ".ann"
if(os.path.isdir(args.o)):
out = open(outfile, 'w')
else:
print("Directory not found, skipping: " + args.o + "/" + f)
continue
## Open the XML file and parse it
if args.f == "gold":
path = args.x + "/" + f + "/" + f + ".TimeNorm.gold.completed.xml"
elif args.f == "chrono":
path = args.x + "/" + f + "/" + f + ".completed.xml"
elif args.f == "rel":
path_1 = args.x + "/" + f + "/" + f + ".Temporal-Relation.gold.completed.xml"
path_2 = args.x + "/" + f + "/" + f + ".Temporal-Entity.gold.completed.xml"
if(os.path.isfile(path_1)):
path = path_1
elif(os.path.isfile(path_2)):
path = path_2
else:
print("Error in TLINK path, can't find file: " + path_1)
print("Error in TLINK path, can't find file: " + path_2)
path = path_1
else:
print("Error, invalid option: " + args.f)
exit(1)
if(os.path.isfile(path)):
myElist = getTargetSpansXML(path)
## Pass this information to extract the text segments and write to file
path2 = args.t + "/" + f + "/" + f
if(os.path.isfile(path2)):
writeTargetSpansXML(path2, myElist, out)
else:
print("Error, no file at: " + path2 + " No output.")
else:
print("Error, can't find file: " + path)
out.close()
print("Completed!")