-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathremoveNedOutColumnsFromMsFile.py
89 lines (80 loc) · 3.1 KB
/
removeNedOutColumnsFromMsFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python
import sys, gzip
msFile = sys.argv[1]
def isSnp(samples, i):
alleleH = {}
for j in range(len(samples)):
if samples[j][i] != "N":
alleleH[samples[j][i]] = 1
return len(alleleH.keys()) > 1
def processSimulation(samples, positions):
newPositions = []
newSamples = [""]*len(samples)
if positions:
for i in range(len(samples[0])):
if isSnp(samples, i):
newPositions.append(positions[i])
for j in range(len(samples)):
newSamples[j] += samples[j][i]
if len(newPositions) > 0:
newPositionsStr = "positions: " + " ".join([str(newPositions[x]) for x in range(len(newPositions))])
else:
newPositionsStr = ""
newSamplesStr = "\n".join([str(newSamples[x]) for x in range(len(newSamples))])
print "\n//\nsegsites: %s\n%s\n%s" %(len(newPositions), newPositionsStr, newSamplesStr)
else:
print "\n//\nsegsites: 0\n"
if msFile == "stdin":
isFile = False
msStream = sys.stdin
else:
isFile = True
if msFile.endswith(".gz"):
msStream = gzip.open(msFile)
else:
msStream = open(msFile)
header = msStream.readline()
program,numSamples,numSims = header.strip().split()[:3]
numSamples,numSims = int(numSamples),int(numSims)
processedSims = 0
#advance to first simulation
line = msStream.readline().strip()
print header.strip()
while not line.strip().startswith("//"):
if line.strip() != "":
print line.strip()
line = msStream.readline()
while line:
if not line.strip().startswith("//"):
sys.exit("Malformed ms-style output file: read '%s' instead of '//'. AAAARRRRGGHHH!!!!!\n" %(line.strip()))
segsitesBlah,segsites = msStream.readline().strip().split()
segsites = int(segsites)
if segsitesBlah != "segsites:":
sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n")
positionsLine = msStream.readline().strip()
if not positionsLine.startswith("positions:"):
if segsites == 0:
processSimulation([], [])
else:
sys.exit("Malformed ms-style output file. AAAARRRRGGHHH!!!!!\n")
else:
positionsLine = positionsLine.split()
positions = [float(x) for x in positionsLine[1:]]
samples = []
for i in range(numSamples):
sampleLine = msStream.readline().strip()
if len(sampleLine) != segsites:
sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples AAAARRRRGGHHH!!!!!\n" %(segsites,len(sampleLine),sampleLine,i,numSamples))
samples.append(sampleLine)
if len(samples) != numSamples:
raise Exception
processSimulation(samples,positions)
processedSims += 1
line = msStream.readline()
#advance to the next non-empty line or EOF
while line and line.strip() == "":
line = msStream.readline()
if processedSims != numSims:
sys.exit("Malformed ms-style output file: %s of %s sims processed. AAAARRRRGGHHH!!!!!\n" %(processedSims,numSims))
if isFile:
msStream.close()