# Copyright (c) 2018
# Amy L. Olex, Virginia Commonwealth University
# alolex at vcu.edu
#
# Luke Maffey, Virginia Commonwealth University
# maffeyl at vcu.edu
#
# Nicholas Morton, Virginia Commonwealth University
# nmorton at vcu.edu
#
# Bridget T. McInnes, Virginia Commonwealth University
# btmcinnes at vcu.edu
#
# This file is part of Chrono
#
# Chrono is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# Chrono is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Chrono; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
## This is the main driver program that runs Chrono.
import argparse
import os
import pickle
import inspect
from chronoML import DecisionTree as DTree
from chronoML import RF_classifier as RandomForest
from chronoML import NB_nltk_classifier as NBclass, ChronoKeras
from chronoML import SVM_classifier as SVMclass
from Chrono import BuildEntities
from Chrono import referenceToken
from Chrono import utils
from transformers import BertModel, BertTokenizer
from tensorflow.keras.models import load_model
from joblib import load
debug=False
## This is the driver method to run all of Chrono.
# @param INDIR The location of the directory with all the files in it.
# @param OUTDIR The location of the directory where you want all the output written.
# @param REFDIR The location of the gold standard XML files for evaluation.
# @return Anafora formatted XML files, one directory per file with one XML file in each directory.
# @return The precision and recall from comparing output results to the gold standard.
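# Example invocations (illustrative sketches only; the data paths, training files, and
# model names below are placeholders, not files shipped with Chrono):
#
#   python Chrono.py -i data/input -o data/output \
#          -m NB -d train_matrix.csv -c train_classes.csv \
#          -b bert-base-uncased -B chronobert_svm.joblib
#
# i2b2-formatted input/output with a Keras CNN classifier instead of the SVM:
#
#   python Chrono.py -I data/i2b2_input -O data/i2b2_output \
#          -m NB -d train_matrix.csv -c train_classes.csv \
#          -b bert-base-uncased -B chronobert_cnn.h5 --cnn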
if __name__ == "__main__":

    ## Parse input arguments
    parser = argparse.ArgumentParser(description='Parse a directory of files to identify and normalize temporal information.')
    parser.add_argument('-i', metavar='inputdir', type=str, help='path to the input directory.', required=False, default=None)
    parser.add_argument('-I', metavar='i2b2inputdir', type=str, help='path to the i2b2 input directory.', required=False, default=None)
    parser.add_argument('-x', metavar='fileExt', type=str, help='input file extension, if any. Default is an empty string.', required=False, default="")
    parser.add_argument('-o', metavar='outputdir', type=str, help='path to the output directory.', required=False, default=None)
    parser.add_argument('-O', metavar='i2b2outdir', type=str, help='The path to the i2b2 XML output directory.', required=False, default=None)
    parser.add_argument('-m', metavar='MLmethod', type=str, help='The machine learning method to use. Must be one of NN (neural network), DT (decision tree), SVM (support vector machine), NB (naive bayes, default).', required=False, default='NB')
    parser.add_argument('-w', metavar='windowSize', type=str, help='An integer representing the window size for context feature extraction. Default is 3.', required=False, default=3)
    parser.add_argument('-d', metavar='MLTrainData', type=str, help='The name of the CSV file that contains the training data matrix.', required=False, default=False)
    parser.add_argument('-c', metavar='MLTrainClass', type=str, help='The name of the file that contains the known classes for the training data matrix.', required=False, default=False)
    parser.add_argument('-M', metavar='MLmodel', type=str, help='The path and file name of a pre-built ML model for loading.', required=False, default=None)
    parser.add_argument('-b', metavar='BERTmodel', type=str,
                        help='The path and file name of a pre-built BERT model for loading.', required=False,
                        default=None)
    parser.add_argument('-B', metavar='BERTClassificationModel', type=str,
                        help='The path and file name of a pre-trained SVM or CNN classification model from ChronoBERT.', required=False,
                        default=None)
    #parser.add_argument('-r', metavar='includeRelative', type=str2bool, help='Tell Chrono to mark relative temporal phrases as temporal.', action="store_true", default=False)
    parser.add_argument('--includeRelative', action="store_true", default=False)
    parser.add_argument('--includeContext', action="store_true", default=False)
    parser.add_argument('--includeAttention', action="store_true", default=False)
    parser.add_argument('--cnn', action="store_true", default=False)

    args = parser.parse_args()
    ## Now we can access each argument as args.i, args.o, args.r
    #### need to check for input and output of one type here.

    global dictpath
    thisfilename = inspect.getframeinfo(inspect.currentframe()).filename
    thispath = os.path.dirname(os.path.abspath(thisfilename))
    dictpath = os.path.join(thispath, "dictionary")
    print("The dictionary path: " + str(dictpath))

    ## Get list of folder names in the input directory
    indirs = []
    infiles = []
    outfiles = []

    if args.O is not None:
        for root, dirs, files in os.walk(args.I, topdown=True):
            files.sort()
            print("FILELIST: " + str(files))
            for name in files:
                indirs.append(os.path.join(args.I))
                infiles.append(os.path.join(args.I, name))
                outfiles.append(os.path.join(args.O, name))
                if not os.path.exists(os.path.join(args.O)):
                    os.makedirs(os.path.join(args.O))
    else:
        for root, dirs, files in os.walk(args.i, topdown=True):
            for name in dirs:
                indirs.append(os.path.join(root, name))
                infiles.append(os.path.join(root, name, name))
                outfiles.append(os.path.join(args.o, name, name))
                if not os.path.exists(os.path.join(args.o, name)):
                    os.makedirs(os.path.join(args.o, name))
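    ## NOTE (input layout inferred from the two walks above): in i2b2 mode (-I/-O) every
    ## file directly under the input directory is processed and written to a file of the
    ## same name in the output directory; otherwise (-i/-o) each subdirectory is expected
    ## to contain a file named after that subdirectory, e.g. data/input/doc0001/doc0001,
    ## with a matching .dct file providing the document time (see the doctime parsing below).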
    ## Get training data for ML methods by importing pre-made boolean matrix
    ## Train ML methods on training data
    if(args.m == "DT" and args.M is None):
        ## Train the decision tree classifier and save in the classifier variable
        #print("Got DT")
        classifier, feats = DTree.build_dt_model(args.d, args.c)
        with open('DT_model.pkl', 'wb') as mod:
            pickle.dump([classifier, feats], mod)
    elif(args.m == "RF" and args.M is None):
        ## Train the random forest classifier and save in the classifier variable
        #print("Got RF")
        classifier, feats = RandomForest.build_model(args.d, args.c)
        with open('RF_model.pkl', 'wb') as mod:
            pickle.dump([classifier, feats], mod)
    elif(args.m == "NN" and args.M is None):
        #print("Got NN")
        ## Train the neural network classifier and save in the classifier variable
        classifier = ChronoKeras.build_model(args.d, args.c)
        feats = utils.get_features(args.d)
        classifier.save('NN_model.h5')
    elif(args.m == "SVM" and args.M is None):
        #print("Got SVM")
        ## Train the SVM classifier and save in the classifier variable
        classifier, feats = SVMclass.build_model(args.d, args.c)
        with open('SVM_model.pkl', 'wb') as mod:
            pickle.dump([classifier, feats], mod)
    elif(args.M is None):
        #print("Got NB")
        ## Train the naive bayes classifier and save in the classifier variable
        classifier, feats, NB_input = NBclass.build_model(args.d, args.c)
        classifier.show_most_informative_features(20)
        with open('NB_model.pkl', 'wb') as mod:
            pickle.dump([classifier, feats], mod)
    elif(args.M is not None):
        #print("use saved model")
        if args.m == "NB" or args.m == "DT":
            with open(args.M, 'rb') as mod:
                print(args.M)
                classifier, feats = pickle.load(mod)
        elif args.m == "NN":
            classifier = load_model(args.M)
            feats = utils.get_features(args.d)
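    ## NOTE: the trained classifiers above are written to fixed file names in the current
    ## working directory (DT_model.pkl, RF_model.pkl, NN_model.h5, SVM_model.pkl, NB_model.pkl).
    ## A later run can reuse one by passing it via -M with the matching -m flag, for example
    ## (an illustrative sketch, assuming the model was saved by a previous run):
    ##
    ##   python Chrono.py -i data/input -o data/output -m NB -M NB_model.pkl ...
    ##
    ## The NN path still re-reads the feature list from the -d training matrix when loading.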
    ## Pass the ML classifier through to the parse SUTime entities method.

    # load in BERT model
    bert_model = BertModel.from_pretrained(args.b, output_hidden_states=True, use_cache=True, output_attentions=True)
    bert_tokenizer = BertTokenizer.from_pretrained(args.b)

    if args.cnn:
        bert_classifier = load_model(args.B)
    else:
        bert_classifier = load(args.B)
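    ## NOTE: with --cnn the -B classifier is loaded as a Keras model via load_model;
    ## otherwise it is deserialized with joblib.load, which presumably expects a
    ## scikit-learn-style classifier such as the ChronoBERT SVM.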
    ## Loop through each file and parse
    for f in range(0, len(infiles)):
        print("Parsing " + infiles[f] + " ...")

        ## Init the ChronoEntity list
        my_chronoentities = []
        my_chrono_ID_counter = 1

        ## Parse out the doctime
        if args.I is not None:
            doctime = utils.getDocTime(infiles[f], i2b2=True)
        else:
            doctime = utils.getDocTime(infiles[f] + ".dct", i2b2=False)
        if(debug): print(doctime)

        ## Parse out reference tokens. The spans returned are character spans, not token spans.
        ## sents is per token; a 1 indicates that the token is the last one in its sentence.
        raw_text, text, tokens, abs_text_spans, rel_text_spans, tags, sents, sent_text, sent_membership = utils.getWhitespaceTokens2(infiles[f] + args.x)
        #my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, remove_stopwords="./Chrono/stopwords_short2.txt")
        my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, abs_span=abs_text_spans, rel_span=rel_text_spans, pos=tags, sent_boundaries=sents, sent_membership=sent_membership)

        if(args.includeRelative):
            print("Including Relative Terms")

        ## Mark all ref tokens if they are numeric or temporal
        chroList = utils.markTemporal(my_refToks, include_relative=args.includeRelative)

        if(debug):
            print("REFERENCE TOKENS:\n")
            for tok in chroList: print(tok)

        tempPhrases = utils.getTemporalPhrases(chroList, sent_text, doctime)

        if(debug):
            for c in tempPhrases:
                print(c)

        chrono_master_list, my_chrono_ID_counter, timex_phrases = BuildEntities.buildChronoList(tempPhrases,
                                                                                                my_chrono_ID_counter,
                                                                                                chroList,
                                                                                                (classifier, args.m),
                                                                                                feats, bert_model,
                                                                                                bert_tokenizer,
                                                                                                bert_classifier,
                                                                                                args.includeContext,
                                                                                                args.includeAttention,
                                                                                                args.cnn,
                                                                                                doctime)

        print("Number of Chrono Entities: " + str(len(chrono_master_list)))

        if args.O is not None:
            utils.write_i2b2(raw_text, timex_phrases, outfile=outfiles[f])
        else:
            utils.write_xml(chrono_list=chrono_master_list, outfile=outfiles[f])