-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTCGA_parseAllClinical.py
executable file
·319 lines (267 loc) · 13.1 KB
/
TCGA_parseAllClinical.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/python
#############
# Amy Olex @ CCTR
# 8/25/2014
#
# Purpose: This Python script takes as input an XML file that was generated by GeneTorrents cgquery program.
# It parses the XML file to extract the meta analysis information out into a tab-delimited file for easy reading.
# Meta analysis information includes the reference genome used for alignement, the alignment software such as BWA,
# and any other processing tools such as samtools that were used to obtain the final BAM files. This script also
# parses out whether or not the BAM file contains unmapped reads, if it marks duplicates and if in includes failed reads.
#
# UPDATED: 9/15/14
# Specilazation: This script was modified from the above to parse out ALL clinical fields from the TCGA XML Clinical files.
# The strategy is to put those with the same namespace together.
#
# UPDATED: 2/2/2016
# Updated line 54 so that is is no longer specific to the xml schema version used by TCGA. Further updates on XML schema should nolonger break this script.
###############
# Import modules
import sys, getopt
import subprocess, shlex, shutil
import os, glob
import xml.etree.ElementTree as ET
import re
def parse_and_get_ns(file):
events = "start", "start-ns"
root = None
ns = {}
for event, elem in ET.iterparse(file, events):
if event == "start-ns":
if elem[0] in ns and ns[elem[0]] != elem[1]:
# NOTE: It is perfectly valid to have the same prefix refer
# to different URI namespaces in different parts of the
# document. This exception serves as a reminder that this
# solution is not robust. Use at your own peril.
raise KeyError("Duplicate prefix with different URI found.")
ns[elem[0]] = "{%s}" % elem[1]
elif event == "start":
if root is None:
root = elem
return ET.ElementTree(root), ns
def parse_tag(tag):
mo = re.match("{http://tcga.nci/bcr/xml/clinical/(.*/).*}(.*)", tag)
new_tag = ''.join(mo.groups()) if mo is not None else ''
return new_tag
# Add 1,2,3,etc to drugs, followups and NTEs and look for others.
# remove tabs between recursive steps
# Elem is a list of xml node objects
def parse_xml_tree_recursive(elem, header, content, parent_tag, flag):
header_return = header
content_return = content
if parent_tag == 'coad/follow_ups':
f = 0
for e in elem:
f = f+1
new_flag = 'FollowupSeq_' + str(f)
e_list = e.getiterator()
#print e.tag + ":\t" + str(len(e_list))
if len(e_list)>1:
#print "Recursive call here"
e_child = e.getchildren()
#print str(len(e_child))
header_return, content_return = parse_xml_tree_recursive(e_child, header_return, content_return, parse_tag(e.tag), new_flag)
# end for e in elem
return header_return, content_return
elif parent_tag == 'pharmaceutical/drugs':
f = 0
for e in elem:
f = f+1
new_flag = 'DrugSeq_' + str(f)
e_list = e.getiterator()
#print e.tag + ":\t" + str(len(e_list))
if len(e_list)>1:
#print "Recursive call here"
e_child = e.getchildren()
#print str(len(e_child))
header_return, content_return = parse_xml_tree_recursive(e_child, header_return, content_return, parse_tag(e.tag), new_flag)
# end for e in elem
return header_return, content_return
elif parent_tag == 'radiation/radiations':
f = 0
for e in elem:
f = f+1
new_flag = 'RadSeq_' + str(f)
e_list = e.getiterator()
#print e.tag + ":\t" + str(len(e_list))
if len(e_list)>1:
#print "Recursive call here"
e_child = e.getchildren()
#print str(len(e_child))
header_return, content_return = parse_xml_tree_recursive(e_child, header_return, content_return, parse_tag(e.tag), new_flag)
# end for e in elem
return header_return, content_return
else:
for e in elem:
e_list = e.getiterator()
#print e.tag + ":\t" + str(len(e_list))
if len(e_list)>1:
#print "Recursive call here"
e_child = e.getchildren()
#print str(len(e_child))
header_return, content_return = parse_xml_tree_recursive(e_child, header_return, content_return, parse_tag(e.tag), flag)
else:
if flag != 0:
try:
header_return = header_return + "\t" + str(flag) + "/" + parse_tag(e.tag)
c = e.text if e.attrib['procurement_status']=='Completed' else e.attrib['procurement_status']
content_return = content_return + "\t" + c
#print e.tag + ":" + c
except KeyError:
header_return = header_return + "\t" + str(flag) + "/" + arse_tag(e.tag)
content_return = content_return + "\tField not avaliable"
else:
try:
c = e.text if e.attrib['procurement_status']=='Completed' else e.attrib['procurement_status']
content_return = content_return + "\t" + c
header_return = header_return + "\t" + parse_tag(e.tag)
#print e.tag + ":" + c
except KeyError:
print "KeyError: " + parse_tag(e.tag)
#header_return = header_return + "\t" + parse_tag(e.tag)
#content_return = content_return + "\tField not avaliable"
# end for e in elem
return header_return, content_return
# 3 spaces indent for main method
def main(argv):
# Parse input arguments
try:
opts, args = getopt.getopt(argv,"hi:o:d:",["idir=","ofile=", "odir="])
except getopt.GetoptError:
print 'TCGA_parseMetadata.py -i <inputdirectory> -o <outputfile>'
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print 'TCGA_parseMetadata.py -i <inputdirectory> -o <outputfile>'
sys.exit()
elif opt in ("-i", "--idir"):
inputdirectory = arg
elif opt in ("-o", "--ofile"):
outputfile = arg
elif opt in ("-d", "--odir"):
outputdir = arg
print 'Input directory is ' + inputdirectory
print 'Output file is ' + outputfile
print 'Output directory is ' + outputdir
# Get list of XML files to process
findcommand = "find %s -iname *clinical.TCGA-*.xml"%(inputdirectory)
proc = subprocess.Popen(findcommand, shell=True, stdout=subprocess.PIPE)
files = proc.communicate()[0]
filelist = files.rstrip().lstrip().split("\n")
print "Parsing %i files."%(len(filelist))
# Open the output file
#out = open(outputfile, "w")
#out.write('Barcode\tdays_to_initial_pathologic_diagnosis\tdays_to_death\tdays_to_birth\tdays_to_last_known_alive\tfollowup_sequence\tvital_status\tdays_to_last_followup\tdays_to_death\tperson_neoplasm_cancer_status\tnew_tumor_event_after_initial_treatment\tdays_to_new_tumor_event_after_initial_treatment\tfollowup_treatment_success\tfollowup_sequence\tvital_status\tdays_to_last_followup\tdays_to_death\tperson_neoplasm_cancer_status\tnew_tumor_event_after_initial_treatment\tdays_to_new_tumor_event_after_initial_treatment\tfollowup_treatment_success\tfollowup_sequence\tvital_status\tdays_to_last_followup\tdays_to_death\tperson_neoplasm_cancer_status\tnew_tumor_event_after_initial_treatment\tdays_to_new_tumor_event_after_initial_treatment\tfollowup_treatment_success\tfollowup_sequence\tvital_status\tdays_to_last_followup\tdays_to_death\tperson_neoplasm_cancer_status\tnew_tumor_event_after_initial_treatment\tdays_to_new_tumor_event_after_initial_treatment\tfollowup_treatment_success\n')
# for each file
# 1-make a new file name to write to
# 2-open the file to read from
# 3-create a header variable
# 4-create an info variable
# 5-parse out the root of the input file
# 6-call the processing function to process the list of elements (this is the recusion step)
# 7-open the file to write to and write the header and info to the outfile.
# 8-close the outfile and the infile
for inputfile in filelist:
# Parse the tree and get the patient barcode
# Parse the names spaces and XML tree
tree, ns = parse_and_get_ns(inputfile)
# Get root of tree
root = tree.getroot()
# Get the patient barcode
barcode = root.getiterator(ns['shared']+"bcr_patient_barcode")[0].text
info = barcode
# make a new file name to write to
new_out_file = outputdir + barcode + "_" + outputfile
# create the header variable
header = ""
# create the content variable
content = barcode
# create the patient list to pass to the recursive routine
patient = root[1].getchildren()
# call the recursive function to parse the tree
new_header, new_content = parse_xml_tree_recursive(patient, header, content, parse_tag(root[1].tag), 0)
# write the new header and new content to a file
out = open(new_out_file, "w")
out.write(new_header)
out.write("\n")
out.write(new_content)
out.write("\n")
out.close()
#
#
#
# # Get days_to_initial_pathologic_diagnosis
# info = info + root.getiterator(ns['shared']+"days_to_initial_pathologic_diagnosis")[0].text + "\t"
#
# # Get days_to_death
# death=root.getiterator(ns['shared']+"days_to_death")
# if death[0].text is None:
# info = info + "NA" + "\t"
# else:
# info = info + death[0].text + "\t"
#
# # Get days_to_birth
# birth = root.getiterator(ns['shared']+"days_to_birth")[0].text
#
# if birth is None:
# info = info + "NA" + "\t"
# else:
# info = info + birth + "\t"
#
# # Get days_to_last_known_alive
# lastalive = root.getiterator(ns['shared']+"days_to_last_known_alive")[0].text
#
# if lastalive is None:
# info = info + "NA" + "\t"
# else:
# info = info + lastalive + "\t"
#
# # For each follow up visit, get the information I need
# try:
# followups = root.getiterator(ns['follow_up_v1.0']+"follow_up")
# for f in followups:
# seq = f.attrib['sequence']
# vital = f.find(ns['shared']+"vital_status").text if f.find(ns['shared']+"vital_status").attrib['procurement_status']=='Completed' else f.find(ns['shared']+"vital_status").attrib['procurement_status']
# last_followup = f.find(ns['shared']+"days_to_last_followup").text if f.find(ns['shared']+"days_to_last_followup").attrib['procurement_status']=='Completed' else f.find(ns['shared']+"days_to_last_followup").attrib['procurement_status']
# days_death = f.find(ns['shared']+"days_to_death").text if f.find(ns['shared']+"days_to_death").attrib['procurement_status']=='Completed' else f.find(ns['shared']+"days_to_death").attrib['procurement_status']
# tumor_status = f.find(ns['shared']+"person_neoplasm_cancer_status").text if f.find(ns['shared']+"person_neoplasm_cancer_status").attrib['procurement_status']=='Completed' else f.find(ns['shared ']+"person_neoplasm_cancer_status").attrib['procurement_status']
# nte = f.getiterator(ns['coad_nte']+"new_tumor_events")[0]
#
# new_event = nte.find(ns['nte']+"new_tumor_event_after_initial_treatment").text if nte.find(ns['nte']+"new_tumor_event_after_initial_treatment").attrib['procurement_status']=='Completed' else nte.find(ns['nte']+"new_tumor_event_after_initial_treatment").attrib['procurement_status']
# days_to_nte = nte.find(ns['nte']+"days_to_new_tumor_event_after_initial_treatment").text if nte.find(ns['nte']+"days_to_new_tumor_event_after_initial_treatment") is not None else "Not in File"
#
# outcome = f.find(ns['shared']+"followup_treatment_success").text if f.find(ns['shared']+"followup_treatment_success").attrib['procurement_status']=='Completed' else f.find(ns['shared']+"followup_treatment_success").attrib['procurement_status']
# alltext = [seq, vital, last_followup, days_death, tumor_status, new_event, days_to_nte, outcome]
# ["Not Avaliable" if v is None else v for v in alltext]
# info = info + "\t".join(alltext) + "\t"
#
# except KeyError:
# info = info + "No Followups Avaliable"
#
# info = info + "\n"
#
# out.write(info)
# END for inputfile in filelist
#close the output file
#out.close()
print 'DONE!'
if __name__ == "__main__":
main(sys.argv[1:])
#
#def parse_and_get_ns(file):
# events = "start", "start-ns"
# root = None
# ns = {}
# for event, elem in ET.iterparse(file, events):
# if event == "start-ns":
# if elem[0] in ns and ns[elem[0]] != elem[1]:
# # NOTE: It is perfectly valid to have the same prefix refer
# # to different URI namespaces in different parts of the
# # document. This exception serves as a reminder that this
# # solution is not robust. Use at your own peril.
# raise KeyError("Duplicate prefix with different URI found.")
# ns[elem[0]] = "{%s}" % elem[1]
# elif event == "start":
# if root is None:
# root = elem
# return ET.ElementTree(root), ns