-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTCGA_parseSurvival.py
executable file
·140 lines (107 loc) · 4.39 KB
/
TCGA_parseSurvival.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/python
#############
# Amy Olex @ CCTR
# 8/25/2014
#
# Purpose: This Python script takes as input an XML file that was generated by GeneTorrent's cgquery program.
# It parses the XML file to extract the meta analysis information out into a tab-delimited file for easy reading.
# Meta analysis information includes the reference genome used for alignment, the alignment software such as BWA,
# and any other processing tools such as samtools that were used to obtain the final BAM files. This script also
# parses out whether or not the BAM file contains unmapped reads, if it marks duplicates and if it includes failed reads.
#
# 8/27/2014
# Specialization: This script is a spin-off of TCGA_parseMetadata.py. It is specialized to parse TCGA XML clinical files
# for survival data only.
###############
# Import modules
import sys, getopt
import subprocess, shlex, shutil
import os, glob
import xml.etree.ElementTree as ET
def parse_and_get_ns(file):
    """Parse an XML document and collect its namespace prefix declarations.

    Args:
        file: A filename or file object; it is passed straight to
            ``ET.iterparse``.

    Returns:
        A ``(tree, ns)`` tuple. ``tree`` is an ``ElementTree`` wrapping the
        first element seen (the document root). ``ns`` maps each declared
        prefix to its URI in ``"{uri}"`` form, so it can be prepended to a
        local tag name, e.g. ``ns['shared'] + "days_to_death"``.

    Raises:
        KeyError: If a prefix is declared more than once with *different*
            URIs. That is legal XML, but a flat prefix -> URI mapping
            cannot represent it, so this function refuses rather than
            silently returning a misleading mapping.
    """
    root = None
    ns = {}
    for event, elem in ET.iterparse(file, events=("start", "start-ns")):
        if event == "start-ns":
            # For "start-ns" events, elem is a (prefix, uri) pair.
            prefix, uri = elem
            qualified = "{%s}" % uri
            # Compare against the stored "{uri}" form. Comparing with the
            # raw uri never matches, which made every repeated declaration
            # of a prefix -- even with the same URI -- raise KeyError.
            if prefix in ns and ns[prefix] != qualified:
                raise KeyError("Duplicate prefix with different URI found.")
            ns[prefix] = qualified
        elif event == "start":
            # The first element to start is the document root.
            if root is None:
                root = elem
    return ET.ElementTree(root), ns
def _first_text(root, tag, missing="NA"):
    """Return the text of the first element named *tag* under *root*.

    Returns *missing* when no such element exists or when its text is
    empty (``None``), so callers can always concatenate the result.
    """
    # iter() replaces getiterator(), which was removed in Python 3.9.
    node = next(root.iter(tag), None)
    if node is None or node.text is None:
        return missing
    return node.text


# 3 spaces indent for main method
def main(argv):
   """Extract survival fields from TCGA clinical XML files into a TSV file.

   Searches the input directory recursively for files whose name matches
   ``*clinical.TCGA-*.xml`` (case-insensitive), and writes one
   tab-delimited row per file to the output file: patient barcode,
   days_to_initial_pathologic_diagnosis, days_to_death and days_to_birth.
   A missing or empty value is written as ``NA``.

   Args:
      argv: Command-line arguments without the program name. Recognised
         options are ``-h``, ``-i/--idir <inputdirectory>`` and
         ``-o/--ofile <outputfile>``.

   Raises:
      SystemExit: After printing usage for ``-h`` (status 0), or for an
         unparsable command line or a missing ``-i``/``-o`` (status 2).
      KeyError: If a clinical file declares no ``shared`` namespace prefix.
   """
   # Function-scope import: the file's import block does not provide fnmatch.
   import fnmatch

   usage = 'TCGA_parseSurvival.py -i <inputdirectory> -o <outputfile>'
   inputdirectory = None
   outputfile = None

   # Parse input arguments
   try:
      opts, args = getopt.getopt(argv, "hi:o:", ["idir=", "ofile="])
   except getopt.GetoptError:
      print(usage)
      sys.exit(2)
   for opt, arg in opts:
      if opt == '-h':
         print(usage)
         sys.exit()
      elif opt in ("-i", "--idir"):
         inputdirectory = arg
      elif opt in ("-o", "--ofile"):
         outputfile = arg
   # Both options are required; without this check a missing one surfaced
   # later as a NameError.
   if inputdirectory is None or outputfile is None:
      print(usage)
      sys.exit(2)

   print('Input directory is " %s' % inputdirectory)
   print('Output file is " %s' % outputfile)

   # Get list of XML files to process. Walking the tree in-process avoids
   # building a shell command from user input (unquoted globs, paths with
   # spaces) and never yields an empty file name when nothing matches.
   pattern = "*clinical.tcga-*.xml"
   filelist = sorted(
      os.path.join(dirpath, name)
      for dirpath, _dirnames, filenames in os.walk(inputdirectory)
      for name in filenames
      if fnmatch.fnmatchcase(name.lower(), pattern)
   )
   print("Parsing %i files." % len(filelist))

   # Local tag names, in output column order; all live in the 'shared'
   # namespace of the clinical XML.
   tags = (
      "bcr_patient_barcode",
      "days_to_initial_pathologic_diagnosis",
      "days_to_death",
      "days_to_birth",
   )

   # Open the output file once, before the loop: re-opening it in "w" mode
   # per input file truncated it each time, keeping only the last row.
   with open(outputfile, "w") as out:
      # The header now names the fourth (days_to_birth) column, which rows
      # always carried but the header left blank.
      out.write('Barcode\tdays_to_initial_pathological_diagnosis'
                '\tdays_to_death\tdays_to_birth\n')
      for inputfile in filelist:
         # Parse the name spaces and XML tree
         tree, ns = parse_and_get_ns(inputfile)
         root = tree.getroot()
         shared = ns['shared']
         row = [_first_text(root, shared + tag) for tag in tags]
         out.write("\t".join(row) + "\n")
      # END for inputfile in filelist

   print('DONE!')
# Script entry point: when executed directly (not imported), pass every
# command-line argument after the program name to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
# NOTE(review): parse_and_get_ns is also defined earlier in this file; this
# later definition rebinds the name once the module has finished loading.
def parse_and_get_ns(file):
    """Parse an XML document and collect its namespace prefix declarations.

    Args:
        file: A filename or file object; it is passed straight to
            ``ET.iterparse``.

    Returns:
        A ``(tree, ns)`` tuple. ``tree`` is an ``ElementTree`` wrapping the
        first element seen (the document root). ``ns`` maps each declared
        prefix to its URI in ``"{uri}"`` form, so it can be prepended to a
        local tag name.

    Raises:
        KeyError: If a prefix is declared more than once with *different*
            URIs. That is legal XML, but a flat prefix -> URI mapping
            cannot represent it.
    """
    root = None
    ns = {}
    for event, elem in ET.iterparse(file, events=("start", "start-ns")):
        if event == "start-ns":
            # For "start-ns" events, elem is a (prefix, uri) pair.
            prefix, uri = elem
            qualified = "{%s}" % uri
            # The mapping stores "{uri}", so compare like with like; a
            # comparison against the raw uri is always unequal and made any
            # repeated declaration of a prefix raise KeyError.
            if prefix in ns and ns[prefix] != qualified:
                raise KeyError("Duplicate prefix with different URI found.")
            ns[prefix] = qualified
        elif event == "start":
            # The first element to start is the document root.
            if root is None:
                root = elem
    return ET.ElementTree(root), ns