-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmergeCollectRnaSeqHistograms.py
executable file
·129 lines (112 loc) · 4.87 KB
/
mergeCollectRnaSeqHistograms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/python
import sys
import re
import os
import fnmatch
from collections import OrderedDict
#####################################
## Find all count files matching pattern entered by user (there should be one count file
## per sample) and create a matrix with one row for each gene and one column for each sample
#####################################
#####################################
## Usage: /opt/bin/python/ makeCountMatrix.py rootDir patternToSearch outputFileName
## Example: /opt/bin/python makeCountMatrix.py /ifs/res/liang/RNASeq/Proj2983_MassagueJ .htseq_count Proj2983_MassagueJ_htseq.count_allSamples.txt
#####################################
def usage():
print "/opt/bin/python rnaseq_count_matrix.py rootDir patternToSearch outputFileName"
return
## create a list of full paths to files that match pattern entered
def findFiles(rootDir,pattern):
"""
create and return a list of full paths to files that match pattern entered
"""
filepaths = []
for path, dirs, files in os.walk(os.path.abspath(rootDir)):
if fnmatch.filter(files, pattern):
for file in fnmatch.filter(files, pattern):
filepaths.append(os.path.join(path,file))
else:
if "Proj" in path.split("/")[-1] and "-" in path.split("/")[-1]:
print>>sys.stderr, "WARNING: No files matching pattern %s found in %s" %(pattern, path)
return filepaths
def printMatrix(matrix,allSamps,outFile):
"""
"""
header = "\t".join(["position"]+allSamps)
with open(outFile,'w') as out:
print>>out,header
for position in matrix.keys():
for samp in allSamps:
if not samp in matrix[position]:
matrix[position][samp] = 0
print>>out,"\t".join([str(x) for x in [position]+[matrix[position][samp] for samp in allSamps]])
return
def makeMatrix(args):
"""
Find files to parse, create one matrix of all counts and print
matrix to file
"""
if len(args) == 3:
rootDir,filePattern,outFile = args
else:
usage()
sys.exit(1)
## store all values in an ordered dict, keyed by sample
matrix = OrderedDict()
allSamps = []
filePattern = '*'+filePattern
## find all cutadapt stats files using pattern
files = findFiles(rootDir,filePattern)
if files:
print>>sys.stderr, "\nCombining the following files:\n"
for file in files:
if 'Histogram' in file:
continue
print>>sys.stderr, file
if os.stat(file)[6]==0: ## file is empty
print>>sys.stderr, "WARNING: This file is empty!"
else:
with open(file,'r') as fl:
lines = fl.read()
if not "## HISTOGRAM" in lines:
print>>sys.stderr, "WARNING: No histogram in file %s" %file
lines = lines.split("\n")
for x in range(len(lines)):
if lines[x].startswith("## METRICS"):
tmp = dict(zip(lines[x+1].split("\t"),lines[x+2].split("\t")))
samp = tmp["SAMPLE"]
if samp in allSamps:
print "ERROR: sample %s found multiple times!!" %samp
else:
allSamps.append(samp)
position = "0"
count = 0
if not position in matrix:
matrix[position] = {}
if not samp in matrix[position]:
matrix[position][samp] = count
continue
with open(file,'r') as fl:
while not "## HISTOGRAM" in fl.readline():
next
samp = fl.readline().strip().split("\t")[1].split(".normalized_coverage")[0]
if samp in allSamps:
print "ERROR: sample %s found multiple times!!" %samp
continue
else:
allSamps.append(samp)
for line in fl:
if len(line.strip())>0:
cols = line.strip().split("\t")
position = cols[0]
count = cols[1]
if not position in matrix:
matrix[position] = {}
if not samp in matrix[position]:
matrix[position][samp] = count
printMatrix(matrix,allSamps,outFile)
else:
print>>sys.stderr, "\nNo files found matching pattern entered. Exiting.\n"
sys.exit(-1)
if __name__ == '__main__':
makeMatrix(sys.argv[1:])