-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxml2conll.py
140 lines (110 loc) · 4.21 KB
/
xml2conll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from xml.dom import minidom
import argparse
import nltk
#! Create file
def bio_formatting(word, tags, entities):
    """Build the CoNLL line(s) for a single token.

    Args:
        word: The token to label.
        tags: Mapping of tag id to ``[category, text]``, where ``text`` is
            either one string (single-word entity) or a list of the words
            that make up a multi-word entity.
        entities: Collection of every word that occurs in some entity. It is
            only used as a membership pre-check before ``tags`` is scanned.

    Returns:
        A two-item list ``[lines, skip]``:

        * single-word entity: ``[word + ' I-<category>\\n', False]``
        * first word of a multi-word entity: one line per entity word
          (``B-`` for the first, ``I-`` for the rest) and the number of
          words in the entity
        * anything else: ``[word + ' 0\\n', False]``

        The first matching tag (in ``tags`` iteration order) wins.
    """
    # NOTE(review): the outside label is the digit "0", not the letter "O"
    # conventionally used in CoNLL files, and single-word entities get "I-"
    # rather than "B-". Both are kept as-is because downstream consumers may
    # rely on them -- confirm before changing.
    if word in entities:
        for value in tags.values():
            category, text = value[0], value[1]
            if isinstance(text, str):
                if text == word:
                    return [word + ' I-' + category + '\n', False]
            elif isinstance(text, list) and text and text[0] == word:
                # Only the first word of a multi-word entity triggers a
                # match; the whole entity is emitted at once and the caller
                # is told how many words it covers.
                rows = [
                    item + (' B-' if index == 0 else ' I-') + category + '\n'
                    for index, item in enumerate(text)
                ]
                return [''.join(rows), len(text)]
    return [word + ' 0' + '\n', False]
def write_line(output, line):
    """Write ``line`` to ``output`` exactly as given.

    Args:
        output: Any object with a ``write`` method (e.g. an open text file).
        line: The text to write; no newline is appended.
    """
    output.write(line)
def create_lines(words, tags, entities):
    """Turn a token sequence into a list of CoNLL-formatted strings.

    Args:
        words: Iterable of tokens in document order.
        tags: Mapping of tag id to ``[category, text]``; passed through to
            ``bio_formatting``.
        entities: Collection of entity words; passed through to
            ``bio_formatting``.

    Returns:
        A list of strings. Each element holds one line per token, except for
        multi-word entities, whose lines are emitted together in a single
        element when their first word is encountered.
    """
    lines = []
    pending = 0  # tokens still to skip because a multi-word entity covered them
    for token in words:
        # The long underscore run is dropped entirely and does not count
        # towards the skip budget.
        if token == '_____________________________________________':
            continue
        if pending:
            pending -= 1
            continue
        # Call the formatter once and unpack, instead of once per field.
        line, skip = bio_formatting(token, tags, entities)
        lines.append(line)
        if skip:
            # ``line`` already contains every word of the entity, so the
            # following ``skip - 1`` tokens must not be emitted again.
            # NOTE(review): those tokens are skipped without checking that
            # they actually match the entity's remaining words.
            pending = skip - 1
    return lines
def write_to_file(doc_text, tags, entities, output_path, filename):
    """Tokenize ``doc_text`` and append its CoNLL lines to an output file.

    The output file is placed in ``output_path`` and named after the base
    name of ``filename`` with its extension replaced by ``.conll``.

    Args:
        doc_text: Raw document text to tokenize with ``nltk.word_tokenize``.
        tags: Mapping of tag id to ``[category, text]``; forwarded to
            ``create_lines``.
        entities: Collection of entity words; forwarded to ``create_lines``.
        output_path: Directory that receives the ``.conll`` file.
        filename: Path of the source file; only its base name is used.
    """
    import os  # local import keeps this block self-contained

    # Strip whatever extension the source file has rather than assuming it is
    # exactly three characters long.
    stem = os.path.splitext(os.path.basename(filename))[0]
    new_file = os.path.join(output_path, stem + '.conll')

    # Tokenize before opening the file so a tokenizer failure does not leave
    # an empty output file behind.
    word_list = nltk.word_tokenize(doc_text)
    lines = create_lines(word_list, tags, entities)

    # NOTE(review): append mode is kept from the original, so converting the
    # same document twice duplicates its content -- confirm this is intended.
    with open(new_file, 'a', encoding='utf-8') as output:
        for line in lines:
            write_line(output, line)
def _split_entity_text(text):
    """Return ``text`` unchanged if it is one word, else a list of its words.

    For multi-word text the string is split on spaces, empty pieces are
    dropped, and a trailing comma on any word longer than one character is
    split off into its own ``','`` item.
    """
    if len(text.split()) <= 1:
        return text
    words = []
    for piece in text.split(' '):
        if not piece:
            continue
        if len(piece) > 1 and piece[-1] == ',':
            # Build a new list instead of inserting into the one being
            # indexed, so every word is examined, not just the first
            # ``len(original)`` positions.
            words.append(piece[:-1])
            words.append(',')
        else:
            words.append(piece)
    return words


def convert_xml_to_conll(file, output_path):
    """Convert one annotated XML document to a CoNLL file in ``output_path``.

    The XML root is expected to contain a ``TAGS`` element, whose child
    elements carry ``id``, ``text`` and ``TYPE`` attributes, and a ``TEXT``
    or ``DOC`` element whose first child node holds the document text.

    Args:
        file: Path of the XML file. Used as the default for the ``--input``
            command-line option and to derive the output file name.
        output_path: Directory that receives the generated ``.conll`` file.

    Raises:
        ValueError: If the root element has no ``TAGS`` or no ``TEXT``/``DOC``
            child element.
        KeyError: If a tag element lacks an ``id``, ``text`` or ``TYPE``
            attribute.
    """
    # NOTE(review): this parses the process's command line on every call.
    # ``--output`` and ``--csv`` are accepted but never used, and ``--input``
    # changes the file that is read while the output name still comes from
    # ``file``. Kept for backward compatibility.
    parser = argparse.ArgumentParser(description='XML to Conll converter')
    parser.add_argument('--input', type=str, default=file,
                        help='The XML file to convert')
    parser.add_argument('--output', type=str,
                        help='The output CONLL file name')
    parser.add_argument('--csv', action='store_true',
                        help='Convert to CSV instead of CONLL')
    args = parser.parse_args()

    # Read XML input file. ``documentElement`` is the root element even when
    # a comment or processing instruction precedes it in the document.
    xmldoc = minidom.parse(args.input)
    root = xmldoc.documentElement
    element_node = minidom.Node.ELEMENT_NODE

    tags_element = None  # TAGS element
    doc_element = None  # TEXT / DOC element
    for item in root.childNodes:
        if item.nodeType != element_node:
            continue
        if item.tagName == 'TAGS':
            tags_element = item
        elif item.tagName in ('TEXT', 'DOC'):
            doc_element = item
    if tags_element is None:
        raise ValueError('no TAGS element found in ' + str(args.input))
    if doc_element is None:
        raise ValueError('no TEXT or DOC element found in ' + str(args.input))

    # Collect every annotation inside TAGS, ignoring nested TEXT elements.
    tags = {}
    for element in tags_element.childNodes:
        if element.nodeType != element_node or element.tagName == 'TEXT':
            continue
        attributes = element.attributes
        _id = attributes['id'].value
        text = attributes['text'].value
        _type = attributes['TYPE'].value
        tags[_id] = [_type, _split_entity_text(text)]

    # Text from the TEXT / DOC element (only its first child node is read).
    doc_text = doc_element.childNodes[0].data

    # Flatten every entity word into one de-duplicated list.
    unique_words = set()
    for tag_attributes in tags.values():
        text = tag_attributes[1]
        if isinstance(text, list):
            unique_words.update(text)
        else:
            unique_words.add(text)
    entities = list(unique_words)

    write_to_file(doc_text, tags, entities, output_path, file)