forked from vibansal/ancestry
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrunancestry.py
316 lines (270 loc) · 16.7 KB
/
runancestry.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#! /usr/bin/env python2.7
# author Vikas Bansal [email protected]
## TODO
## 1. print # of sites and average read-depth per site in calculateGLL code
## 3. option to flip alleles for - strand SNPs
## 4. option to delete intermediate ancestry files created
## 5. option to match SNPs using rsid or chr:position
## 6. option to read VCF file with one or more individuals
## 7. option to read a file with list of BAM files for sequencing experiment
## 8. option to specify --delta threshold for parsimonious ancestry estimation
## 9. along with admixture estimates: also print delta for each value...
## 10. read plink .bed files binary format for genotypes
## 11. option to do ancestry for only specific samples in ped file
import sys, os, glob, string,math, time
from math import log
from optparse import OptionParser
from subprocess import call
POOLSIZE= 2;
INCLUDE_STRAND_AMBIG = 0;
CORES =1;
WINDOW =0;
PARSIMONY =1;
HWE_CHECK = 0;
OUTPUT_GLL_ONLY = 0;
ADD_chr_names = False;
SORT_SNPS = 1; ## sort the 'forGLL' file before running calculateGLL code, added 07/07/2017
##################################################################################################################################
def output_GLL_inputfile(afmatrixfile,outfile):
File1 = open(outfile,'w');
snp_list = [];
File = open(afmatrixfile);
for line in File:
if line[0] != '#':
snp = line.strip().split();
if not ADD_chr_names : print >>File1, 'SNP',snp[0].strip('chr'),snp[1],snp[3],snp[4],snp[3] + '/' + snp[4],100;
elif 'chr' in snp[0]: print >>File1, 'SNP',snp[0],snp[1],snp[3],snp[4],snp[3] + '/' + snp[4],100;
File.close();
File1.close();
if SORT_SNPS ==1:
call('sort -k 2,2 -k 3,3g ' + outfile + ' > ' + outfile + '.sorted',shell=True);
call('mv ' + outfile + '.sorted ' + outfile,shell=True);
def make_ancestry_inputfile_simple(afmatrixfile,GLLfile,outfile):
File1 = open(outfile,'w');
File = open(GLLfile); GLLs = {};
for line in File: snp= line.strip().split(); GLLs[(snp[POOLSIZE+1].strip('chr'),int(snp[POOLSIZE+2]))] = ' '.join(snp[0:POOLSIZE+1]);
File.close();
File = open(afmatrixfile);
for line in File:
snp = line.strip().split();
if line[0] == '#':
print >>File1, '#GLL',
for i in xrange(5,len(snp)): print >>File1, snp[i],
print >>File1, '\n',
else:
try:
ll = GLLs[(snp[0].strip('chr'),int(snp[1]))];
print >>File1, ll,
for i in xrange(5,len(snp)): print >>File1, snp[i],
print >>File1, '\n',
except KeyError: pass;
File.close();
File1.close();
## function that processes bam file for ancestry calculation
def ancestry_pipeline_bam(matrixfile,bamfile,outfile,executable_dir_path):
print >>sys.stderr, "making input file for getting genotype likelihoods";
output_GLL_inputfile(matrixfile,outfile+'.forGLL');
print >>sys.stderr, "calculating genotype likelihoods for bam file",bamfile,"poolsize is: ",POOLSIZE
if OUTPUT_GLL_ONLY ==1:
call([ executable_dir_path + "/calculateGLL --allsites 1 -p " + `POOLSIZE` + " --bam " + bamfile + " --variants " + outfile +".forGLL" + " --mmq 30 > " + outfile + ".GLL" ],shell=True)
else:
call([ executable_dir_path + "/calculateGLL -p " + `POOLSIZE` + " --bam " + bamfile + " --variants " + outfile +".forGLL" + " --mmq 30 > " + outfile + ".GLL" ],shell=True)
print >>sys.stderr, "making input file for ancestry calculations";
make_ancestry_inputfile_simple(matrixfile,outfile + ".GLL",outfile + ".ancestry.input");
#python runancestry.py ancestry-data/hapmap3.allchroms.shared.matrix.hg19 ancestry-data/DM00231.bwamem.MD.GLL > ancestry-data/DM00231.bwamem.MD.input
print >>sys.stderr, "ancestry admixture calculations for",bamfile,"poolsize is ",POOLSIZE;
call([executable_dir_path + "/ANCESTRY -p " + `POOLSIZE` + " --pr 1 -i " + outfile + ".ancestry.input > " + outfile + '.ancestry.out'],shell=True);
#call(["rm -f " + bamfile + ".GLL"],shell=True);
#call(["rm -f " + bamfile + ".forGLL"],shell=True);
#call(["rm -f " + bamfile + ".ancestry.input"],shell=True);
##################################################################################################################################
## add support for plink ped files for multiple samples or processing a file with list of bamfile names
## function that reads in PLINK ped and map files and determines ancestry for each individual in ped file
def make_ancestry_inputfile_plink(pedfile,mapfile,afmatrixfile,outfile,executable_dir_path,modulo_CORE):
RC = {}; RC['A'] = 'T'; RC['T'] = 'A'; RC['C'] = 'G'; RC['G'] = 'C';
gll_low = -4; gll_high = 0; gll_equal = -0.477121;
## read the allele frequency matrix file into a hashtable once for all samples
print >>sys.stderr, "reading allele frequency file",afmatrixfile;
File = open(afmatrixfile);lines =0; AFtable = {}; headerline = [];
for line in File:
snp = line.strip().split();
if line[0] == '#' or lines ==0: headerline = snp;
elif INCLUDE_STRAND_AMBIG ==1: AFtable[snp[2]] = snp;
elif snp[3] == 'A' and snp[4] == 'T': continue;
elif snp[3] == 'T' and snp[4] == 'A': continue;
elif snp[3] == 'C' and snp[4] == 'G': continue;
elif snp[3] == 'G' and snp[4] == 'C': continue;
else: AFtable[snp[2]] = snp;
lines +=1;
File.close()
## read the map file once so that we have SNP rsid for each SNP
print >>sys.stderr, "reading mapfile",mapfile;
try: File = open(mapfile,'r');
except IOError: print >>sys.stderr, "map file",mapfile,"not found"; return -1;
SNPlist = [];
for line in File:
snp = line.strip().split();
if snp[0][0] == '#': continue; ## BUG if first line has '# chrom', make sure the lines match up to ped file... 10/04/2014
SNPlist.append([snp[1],snp[0],snp[2]]);
File.close();
#######
#### read the ped file line by line and calculate ancestry for each individual
File = open(pedfile,'r'); samples = 0;
for line in File:
if CORES > 1 and samples%CORES != modulo_CORE: samples +=1; continue;
GLLs = {};
genotypes = line.strip().split();
sampleid = genotypes[1]; outfilename = outfile+'.'+sampleid + '.input';
File1 = open(outfilename,'w');
# Family ID Individual ID FatherID MotherID Sex phenotype #SIM_1 SIM_1 0 0 1 -9 C C C C A
print >>File1, '#GLL',
for i in xrange(5,len(headerline)): print >>File1, headerline[i],
print >>File1, '\n',
overlapping_markers =0; missing_markers = 0;
for i in xrange(6,len(genotypes),2):
s = i-6; s /=2;
try:
snp = AFtable[SNPlist[s][0]]; flag = 0;
if genotypes[i] == snp[3] and genotypes[i+1] == snp[3]: print >>File1, gll_high,gll_low,gll_low,
elif genotypes[i] == snp[4] and genotypes[i+1] == snp[4]: print >>File1, gll_low,gll_low,gll_high,
elif genotypes[i] == snp[3] and genotypes[i+1] == snp[4]: print >>File1, gll_low,gll_high,gll_low,
elif genotypes[i+1] == snp[3] and genotypes[i] == snp[4]: print >>File1, gll_low,gll_high,gll_low,
elif genotypes[i+1] == RC[snp[3]] and genotypes[i] == RC[snp[4]]: print >>File1, gll_low,gll_high,gll_low,
elif genotypes[i] == RC[snp[3]] and genotypes[i+1] == RC[snp[4]]: print >>File1, gll_low,gll_high,gll_low,
elif genotypes[i] == RC[snp[4]] and genotypes[i+1] == RC[snp[4]]: print >>File1, gll_low,gll_low,gll_high,
elif genotypes[i] == RC[snp[3]] and genotypes[i+1] == RC[snp[3]]: print >>File1, gll_high,gll_low,gll_low,
else:
flag = 1; missing_markers +=1;
#print i,'missing',snp,SNPlist[s],genotypes[i],genotypes[i+1]
if flag ==0:
for j in xrange(5,len(snp)): print >>File1, snp[j],
print >>File1, '\n',
overlapping_markers +=1;
except KeyError:
#print >>sys.stderr, 'variant not found',SNPlist[s][0];
pass;
File1.close();
print >>sys.stderr, "\n\nancestry admixture calculations for individual:",samples+1,sampleid,'using',overlapping_markers,'markers';
#print >>sys.stderr, "missing",missing_markers,len(genotypes)/2;
if WINDOW ==0: call(["" + executable_dir_path + "/ANCESTRY -p 2 --pr " + `PARSIMONY` + " --HWE " + `HWE_CHECK` + " -i " + outfilename + " > " + outfilename + ".ancestry"],shell=True);
else:
print >>sys.stderr, "calling BFGS method in windows";
call([executable_dir_path + "/ANCESTRY.1 -p 2 --pr " + `PARSIMONY` + " -i " + outfilename + " > " + outfilename + ".ancestry"],shell=True);
#call(["rm -f " + outfilename],shell=True);
#call(["rm -f " + outfilename + " " + outfilename + ".ancestry"],shell=True);
samples +=1;
#sys.exit();
"""
"""
File.close();
####################################################################################################################################################
## genotype file is rsid AG pairs
def make_ancestry_inputfile_rsid(afmatrixfile,GLLfile,outfile):
RC = {}; RC['A'] = 'T'; RC['T'] = 'A'; RC['C'] = 'G'; RC['G'] = 'C';
#gll_low = -3.602; gll_high = -0.000021715; gll_equal = -0.477121255; ## assuming error rate = 1/20000 or 50 errors per million genotypes
gll_low = -4; gll_high = -0.000086868; gll_equal = -0.477121;
#gll_low = -9; gll_high = 0.00000; gll_equal = -0.477121;
column =1;
## read genotype file and store genotypes in hashtable
File1 = open(outfile,'w');
GLLs = {};
File = open(GLLfile);
for line in File:
snp= line.strip().split();
if snp[column] == 'NN' or snp[column] == 'N/N' or snp[column] == 'NoCall' or snp[column] == '--': continue;
if snp[column][1] == '/': genotype = snp[column][0] + snp[column][2];
else: genotype = snp[column];
GLLs[snp[0]] = genotype;
File.close();
lines = 0; valid_snps = 0;
File = open(afmatrixfile);
for line in File:
snp = line.strip().split();
if line[0] == '#' or lines ==0:
print >>File1, '#GLL',
for i in xrange(5,len(snp)): print >>File1, snp[i],
print >>File1, '\n',
## ignore A/T and C/G snps that are strand ambiguous
elif INCLUDE_STRAND_AMBIG ==0 and snp[3] == 'A' and snp[4] == 'T': continue;
elif INCLUDE_STRAND_AMBIG ==0 and snp[3] == 'T' and snp[4] == 'A': continue;
elif INCLUDE_STRAND_AMBIG ==0 and snp[3] == 'C' and snp[4] == 'G': continue;
elif INCLUDE_STRAND_AMBIG ==0 and snp[3] == 'G' and snp[4] == 'C': continue;
elif snp[3] in RC and snp[4] in RC: ## the two alleles are [ACTG], ignore indel D/I alleles
try:
GENOTYPE = GLLs[snp[2]]; flag =0;
if GENOTYPE[0] == snp[3] and GENOTYPE[1] == snp[3]: print >>File1, gll_high,gll_low,gll_low,
elif GENOTYPE[0] == snp[4] and GENOTYPE[1] == snp[4]: print >>File1, gll_low,gll_low,gll_high,
elif GENOTYPE[0] == snp[3] and GENOTYPE[1] == snp[4]: print >>File1, gll_low,gll_high,gll_low,
elif GENOTYPE[1] == snp[3] and GENOTYPE[0] == snp[4]: print >>File1, gll_low,gll_high,gll_low,
elif GENOTYPE[1] == RC[snp[3]] and GENOTYPE[0] == RC[snp[4]]: print >>File1, gll_low,gll_high,gll_low,
elif GENOTYPE[0] == RC[snp[3]] and GENOTYPE[1] == RC[snp[4]]: print >>File1, gll_low,gll_high,gll_low,
elif GENOTYPE[0] == RC[snp[4]] and GENOTYPE[1] == RC[snp[4]]: print >>File1, gll_low,gll_low,gll_high,
elif GENOTYPE[0] == RC[snp[3]] and GENOTYPE[1] == RC[snp[3]]: print >>File1, gll_high,gll_low,gll_low,
else:
#print >>File1, gll_equal,gll_equal,gll_equal,
#print >>sys.stderr, GENOTYPE,snp;
flag = 1;
if flag ==0:
for i in xrange(5,len(snp)): print >>File1, snp[i],
print >>File1, '\n',
valid_snps +=1;
#print >>sys.stderr, snp[2];
except KeyError: pass;
lines +=1;
print >>sys.stderr, "ancestry genotype file has",valid_snps,"markers";
File.close();
File1.close();
def ancestry_pipeline_genotypes(matrixfile,genotype_file,outfile,executable_dir_path):
print >>sys.stderr, "making input file for ancestry calculations";
make_ancestry_inputfile_rsid(matrixfile,genotype_file,genotype_file + ".ancestry.input");
print >>sys.stderr, "ancestry admixture calculations",genotype_file
if WINDOW ==0: call([executable_dir_path + "/ANCESTRY --LRT 7.68 -p 2 --pr 1 -i " + genotype_file + ".ancestry.input > " + outfile],shell=True);
else:
print >>sys.stderr, "calling BFGS method in windows";
call([executable_dir_path + "/ANCESTRY.1 -p 2 --pr 1 -i " + genotype_file + ".ancestry.input > " + outfile],shell=True);
#call([executable_dir_path + "/ANCESTRY -p 2 --pr 1 -i " + genotype_file + ".ancestry.input > " + outfile],shell=True);
#call(["time ./ANCESTRY --ea 1 -p 2 --pr 1 -i " + bamfile + ".ancestry.input > " + outfile],shell=True);
#call(["rm -f " + bamfile + ".ancestry.input"],shell=True);
###########################################################################################################################
parser = OptionParser();
#parser.add_option("--type",dest="type",type="string",help="type of input file: bam/geno/genotype/VCF/ped",default="");
parser.add_option("-b","--bam",dest="BAMfile",type="string",help="input bam file name",default="");
parser.add_option("--geno",dest="genofile",type="string",help="input file name for simple genotype file: rsid AG on each line",default="");
parser.add_option("--vcf","--VCF",dest="vcffile",type="string",help="input file name for VCF file",default="");
parser.add_option("--plink",dest="pedfile",type="string",help="input file name for plink file in ped/map format (provide prefix of ped file name without the .ped)",default="");
parser.add_option("-f","--freq",dest="AFfile",type="string",help="allele frequency file",default="");
parser.add_option("-o","--out",dest="outfile",type="string",help="prefix of output file names",default="iADMIX.out");
parser.add_option("-p","--poolsize",dest="POOLSIZE",type="int",help="pool size for non-diploid samples, default = 2",default=2);
parser.add_option("--pr","--parsimony",dest="pr",type="int",help="parsimonious ancestry estimation 0/1, default = 1",default=1);
parser.add_option("--hwe","--HWE",dest="HWE",type="int",help="HWE deviation estimation 0/1, default = 0",default=0);
parser.add_option("--strand",dest="include_strand_ambiguous",type="int",help="<0/1> include SNPs that are strand ambiguous (A/T, C/G): default = 0 (not included)",default=0);
parser.add_option("--path",dest="path",type="string",help="path to directory with executables ANCESTRY and calculateGLL",default=".");
parser.add_option("-c","--cores",dest="CORES",type="int",help="number of cores for processing in parallel",default=1);
parser.add_option("-m","--modcore",dest="MOD_CORE",type="int",help="0/1/2/3..../CORES-1",default=0);
parser.add_option("-w","--windows",dest="WINDOW",type="int",help="0/1",default=0);
parser.add_option("-g","--gll",dest="GLL_ONLY",type="int",help="0/1",default=0); # only output genotypes likelihoods, no ancestry calculation...
parser.add_option("--addchr", dest="ADD_chr_names", action="store_true", default = False);
(options,args) = parser.parse_args(); POOLSIZE = options.POOLSIZE; CORES = options.CORES; HWE_CHECK = options.HWE; OUTPUT_GLL_ONLY = options.GLL_ONLY;
WINDOW = options.WINDOW;
ADD_chr_names = options.ADD_chr_names;
INCLUDE_STRAND_AMBIG = options.include_strand_ambiguous
PARSIMONY = options.pr;
#file = open(sys.argv[1],'rb'); while True: byte = file.read(1); print ord(byte); sys.exit();
if (options.BAMfile == "" and options.genofile == "" and options.pedfile == "") or options.AFfile == "":
print >>sys.stderr,"\n####program iAdmix for estimation of ancestry using population allele frequencies###\n";
print >>sys.stderr, "python runancestry.py --bam sample.bam --geno sample.genotypes --freq allelefrequency_file --out outputfile --poolsize 2";
print >>sys.stderr, "\nExample: python runancestry.py --freq hapmap3.allchroms.shared.matrix --geno HGDP01254.genotypes --out HGDP12054.ancestry";
print >>sys.stderr, "Example: python runancestry.py --freq hapmap3.allchroms.shared.matrix --bam HGDP01254.sorted.bam --out HGDP12054.ancestry";
print >>sys.stderr, "Example: python runancestry.py --freq hapmap3.allchroms.shared.matrix --plink HGDP01254.genotypes (ped/map) --out HGDP12054.ancestry\n\n";
parser.print_help()
sys.exit();
elif options.pedfile != "":
make_ancestry_inputfile_plink(options.pedfile+'.ped',options.pedfile+'.map',options.AFfile,options.outfile,options.path,options.MOD_CORE);
elif options.BAMfile != "":
if not os.path.exists(options.path + "/ANCESTRY"): print >>sys.stderr, "ANCESTRY executable does not exist, please specify correct path with the executable using --path option\n"; sys.exit();
if not os.path.exists(options.path + "/calculateGLL"): print >>sys.stderr, "calculateGLL executable does not exist, please specify correct path with the executable using --path option\n"; sys.exit();
ancestry_pipeline_bam(options.AFfile,options.BAMfile,options.outfile,options.path);
elif options.genofile != "":
if not os.path.exists(options.path + "/ANCESTRY"): print >>sys.stderr, "ANCESTRY executable does not exist, please specify correct path with the executable using --path option\n"; sys.exit();
ancestry_pipeline_genotypes(options.AFfile,options.genofile,options.outfile,options.path);
########################################################################################################################################