-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathASE_count.py
105 lines (87 loc) · 3.84 KB
/
ASE_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
#coding:utf-8
"""
Author: --<Zefeng Wu>
Email: [email protected]
Purpose: Extract allele-specific reads and counts at known SNP sites
Created: 2019年10月17日
"""
import re
import pysam
import argparse
parser = argparse.ArgumentParser(description="""Split bam file and count allelic reads number based on known SNP sites""")
parser.add_argument("-b", dest="bam_file", help="Input bam file (required)", required=True)
parser.add_argument("-s", dest="snp_file", help="Input SNP file (a vcf format is required)", required=True)
parser.add_argument("-m", "--mq", dest="map_qual", help="Set a mapping quality threshold (default:20)", default=20)
parser.add_argument("--bq", dest="base_qual", help="Set a base quality threshold (default:20)", default=20)
args = parser.parse_args()
snp_file = vars(args)['snp_file']
bam_file = vars(args)['bam_file']
map_qual = int(vars(args)['map_qual'])
base_qual = int(vars(args)['base_qual'])
# define two function for store snp info and call snp reads
def Load_snps(snp_file):
snp_list=[]
with open(snp_file,"r") as fh:
for row in fh:
if re.match("\#",row):
continue
snp_info = row.strip().split()
snp_list.append([snp_info[0], int(snp_info[1]), snp_info[3],snp_info[4]])
fh.close()
return sorted(snp_list)
def Reads_parse(snp, bamfh, map_qual=20, base_qual=20, snp1_count=0, snp2_count=0):
read_set1 = set()
read_set2 = set()
(chrom,bps,allele1,allele2)=snp
found = False
for pile in bamfh.pileup(region=chrom,start=bps-1,end=bps):
if pile.pos == bps-1:
pile_col=pile.pileups # <pysam.calignmentfile.PileupRead object>
found = True
break
if found:
for pile_read in pile_col:
# check whether read is in get_reads or col_reads
qname=pile_read.alignment.query_name
if not pile_read.query_position:
continue
if (pile_read.alignment.query_sequence[pile_read.query_position] == allele1
and pile_read.alignment.mapping_quality >= map_qual
and pile_read.alignment.query_qualities[pile_read.query_position] >= base_qual ) :
read_set1.add(qname)
snp1_count+=1
if (pile_read.alignment.query_sequence[pile_read.query_position] == allele2
and pile_read.alignment.mapping_quality >= map_qual
and pile_read.alignment.query_qualities[pile_read.query_position] >= base_qual ) :
read_set2.add(qname)
snp2_count+=1
return [read_set1,read_set2,snp1_count,snp2_count]
# Import bam file and snp file
snp_list = Load_snps(snp_file)
bamfh=pysam.AlignmentFile(bam_file,'rb')
outfile = open(bam_file+"_allelic_counts.txt","w")
# Count the allelic read number and store the name of SNP reads.
read_names_1 = set()
read_names_2 = set()
for snp in snp_list:
print snp
read_set=Reads_parse(snp,bamfh,map_qual=map_qual,base_qual= base_qual,snp1_count=0,snp2_count=0)
outfile.write("\t".join([str(m) for m in snp])+"\t"+str(read_set[2])+"\t"+str(read_set[3])+"\n")
read_names_1 |= read_set[0]
read_names_2 |= read_set[1]
bamfh.close()
# Output the splited bam files based on previous reads names at SNP sites
bamfh = pysam.AlignmentFile(bam_file,'rb')
outfh1 = pysam.Samfile(bam_file+"_ref.bam","wb",template=bamfh)
outfh2 = pysam.Samfile(bam_file+"_alt.bam","wb",template=bamfh)
index = 0
for read in bamfh.fetch(until_eof=True):
index += 1
if index%1000000 == 0:
print "screened "+ str(index) + " reads of " + bam_file
if read.query_name in read_names_1:
outfh1.write(read)
if read.query_name in read_names_2:
outfh2.write(read)
bamfh.close()