# TCGA_parseSurvival.py

#!/usr/bin/python

#############
# Amy Olex @ CCTR
# 8/25/2014
#
# Purpose: This Python script takes as input an XML file that was generated by GeneTorrents cgquery program.
# It parses the XML file to extract the meta analysis information out into a tab-delimited file for easy reading.
# Meta analysis information includes the reference genome used for alignment, the alignment software such as BWA,
# and any other processing tools such as samtools that were used to obtain the final BAM files.  This script also
# parses out whether or not the BAM file contains unmapped reads, if it marks duplicates and if it includes failed reads.
# 
# 8/27/2014
# Specialization:  This script is a spin-off of TCGA_parseMetadata.py.  It is specialized to parse TCGA XML clinical files
# for survival data only.
###############


# Import modules
import sys, getopt
import subprocess, shlex, shutil
import os, glob
import xml.etree.ElementTree as ET


def parse_and_get_ns(file):
   """Parse an XML document and collect its namespace prefix declarations.

   Args:
      file: Filename or file object; passed unchanged to ``ET.iterparse``.

   Returns:
      A ``(tree, ns)`` tuple. ``tree`` is an ``ET.ElementTree`` wrapping the
      document's root element. ``ns`` maps every declared prefix to its URI
      in ``"{uri}"`` form, so ``ns[prefix] + "tag"`` is a qualified tag name.

   Raises:
      KeyError: If the same prefix is declared with two different URIs.
   """
   root = None
   ns = {}
   for event, elem in ET.iterparse(file, ("start", "start-ns")):
      if event == "start-ns":
         # For "start-ns" events the payload is a (prefix, uri) pair.
         prefix, uri = elem
         qualified = "{%s}" % uri
         # Compare against the stored "{uri}" form. Comparing with the bare
         # URI never matches, which made a harmless re-declaration of the
         # same prefix/URI pair raise.
         if prefix in ns and ns[prefix] != qualified:
            # NOTE: It is perfectly valid to have the same prefix refer
            #     to different URI namespaces in different parts of the
            #     document. This exception serves as a reminder that this
            #     solution is not robust.    Use at your own peril.
            raise KeyError("Duplicate prefix with different URI found.")
         ns[prefix] = qualified
      elif event == "start" and root is None:
         # The first "start" event belongs to the document's root element.
         root = elem
   return ET.ElementTree(root), ns


# 3 spaces indent for main method
def _find_clinical_files(inputdirectory):
   """Return the sorted paths of clinical XML files below *inputdirectory*.

   File names are matched case-insensitively against
   ``*clinical.TCGA-*.xml``, the same pattern the script previously handed
   to ``find -iname``.
   """
   # Local import: fnmatch is not among this file's top-level imports.
   import fnmatch

   pattern = "*clinical.tcga-*.xml"
   matches = []
   for dirpath, _dirnames, filenames in os.walk(inputdirectory):
      for name in filenames:
         # Lower-case the name and use fnmatchcase so the match is
         # case-insensitive regardless of platform.
         if fnmatch.fnmatchcase(name.lower(), pattern):
            matches.append(os.path.join(dirpath, name))
   matches.sort()
   return matches


def _first_text(root, tag):
   """Return the text of the first element named *tag*, or "NA" if absent/empty."""
   elem = next(root.iter(tag), None)
   if elem is None or elem.text is None:
      return "NA"
   return elem.text


# 3 spaces indent for main method
def main(argv):
   """Extract survival fields from clinical XML files into a tab-delimited file.

   Args:
      argv: Command-line arguments without the program name. Recognised
         options are ``-i/--idir <inputdirectory>``, ``-o/--ofile
         <outputfile>`` and ``-h``.

   Writes one header line followed by one row per matching XML file to the
   output file. Each row holds the patient barcode,
   days_to_initial_pathologic_diagnosis, days_to_death and days_to_birth;
   a missing element or empty text is written as "NA".

   Raises:
      SystemExit: Status 2 on invalid or missing options; status 0 after
         printing the usage text for ``-h``.
      KeyError: If a file declares no ``shared`` namespace prefix.
   """
   usage = 'TCGA_parseSurvival.py -i <inputdirectory> -o <outputfile>'
   inputdirectory = None
   outputfile = None

   # Parse input arguments
   try:
      opts, _ = getopt.getopt(argv, "hi:o:", ["idir=", "ofile="])
   except getopt.GetoptError:
      print(usage)
      sys.exit(2)
   for opt, arg in opts:
      if opt == '-h':
         print(usage)
         sys.exit()
      elif opt in ("-i", "--idir"):
         inputdirectory = arg
      elif opt in ("-o", "--ofile"):
         outputfile = arg

   # Both options are required; without this check a missing one surfaced
   # later as an unbound local variable.
   if inputdirectory is None or outputfile is None:
      print(usage)
      sys.exit(2)

   print('Input directory is "%s"' % inputdirectory)
   print('Output file is "%s"' % outputfile)

   # Get list of XML files to process
   filelist = _find_clinical_files(inputdirectory)
   print("Parsing %i files." % len(filelist))

   # Local element names, in output column order; all are looked up in the
   # namespace bound to the "shared" prefix.
   fields = (
      "bcr_patient_barcode",
      "days_to_initial_pathologic_diagnosis",
      "days_to_death",
      "days_to_birth",
   )

   # Open the output once, before the loop, so rows from every input file
   # accumulate instead of each file truncating the previous output.
   with open(outputfile, "w") as out:
      out.write('Barcode\tdays_to_initial_pathological_diagnosis\tdays_to_death\tdays_to_birth\n')

      for inputfile in filelist:
         # Parse the namespaces and XML tree
         tree, ns = parse_and_get_ns(inputfile)
         root = tree.getroot()

         shared = ns['shared']
         row = [_first_text(root, shared + name) for name in fields]
         out.write("\t".join(row) + "\n")
      # END for inputfile in filelist

   print('DONE!')


# Run main() only when this file is executed as a script (not when it is
# imported), passing the command-line arguments without the program name.
if __name__ == "__main__":
   main(sys.argv[1:])

# NOTE(review): this is a second copy of parse_and_get_ns; an identically
# named function is already defined earlier in this file. Because this copy
# sits after the __main__ guard, a script run uses the earlier definition,
# while an import ends up with this one. Consider deleting one of the two.
def parse_and_get_ns(file):
   """Parse an XML document and collect its namespace prefix declarations.

   Args:
      file: Filename or file object; passed unchanged to ``ET.iterparse``.

   Returns:
      A ``(tree, ns)`` tuple. ``tree`` is an ``ET.ElementTree`` wrapping the
      document's root element. ``ns`` maps every declared prefix to its URI
      in ``"{uri}"`` form, so ``ns[prefix] + "tag"`` is a qualified tag name.

   Raises:
      KeyError: If the same prefix is declared with two different URIs.
   """
   root = None
   ns = {}
   for event, elem in ET.iterparse(file, ("start", "start-ns")):
      if event == "start-ns":
         # For "start-ns" events the payload is a (prefix, uri) pair.
         prefix, uri = elem
         qualified = "{%s}" % uri
         # Compare against the stored "{uri}" form. Comparing with the bare
         # URI never matches, which made a harmless re-declaration of the
         # same prefix/URI pair raise.
         if prefix in ns and ns[prefix] != qualified:
            # NOTE: It is perfectly valid to have the same prefix refer
            #     to different URI namespaces in different parts of the
            #     document. This exception serves as a reminder that this
            #     solution is not robust.    Use at your own peril.
            raise KeyError("Duplicate prefix with different URI found.")
         ns[prefix] = qualified
      elif event == "start" and root is None:
         # The first "start" event belongs to the document's root element.
         root = elem
   return ET.ElementTree(root), ns