-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBamReader.h
executable file
·201 lines (183 loc) · 5.85 KB
/
BamReader.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
/**
** A class with utilities for reading bam files.
**
** Author: Nadia Davidson, [email protected]
** Modified:
**/
#include <string>
#include <iostream>
#include <sstream>
#include <vector>
//depends on samtools header files
#include "htslib/sam.h"
#include "htslib/faidx.h"
#include "htslib/kstring.h"
#include "htslib/khash.h"
#include "samtools.h"
using namespace std;
class BamReader{
samFile * in; /
bam1_t *b;
hts_idx_t *idx;
bam_hdr_t *header;
int nread;
public:
void setFile(string filename){
if((in = sam_open(filename.c_str(), "r")) == 0) {
cerr << "failed to open "<< filename << " for reading." << endl;
exit(1);
}
if((header = sam_hdr_read(in)) == 0) {
cerr << "failed to open "<< filename << " header for reading." << endl;
exit(1);
}
//load the index
idx = 0;
idx = bam_index_load(filename.c_str());
if(idx==0){
cerr << "failed find index file for "<< filename << endl;
exit(1);
}
b = bam_init1();
nread=0;
}
string get_next_bad_map_seq(int flank_size, int min_gap){
while(sam_read1(in, header, b) >= 0){
nread++;
if( nread % 1000000 == 0 ) cerr << nread/1000000 << " million reads processed" << endl;
int seq_length=b->core.l_qseq;
uint32_t *cigar = bam_get_cigar(b); //bam1_cigar(b);
int matched=0;
int rest=0;
for(int k=0; k < b->core.n_cigar; k++){
int c_oper=(cigar[k])&BAM_CIGAR_MASK;
int c_size=(cigar[k])>>BAM_CIGAR_SHIFT;
if(c_oper==BAM_CMATCH)
matched+=c_size;
else
rest+=c_size;
}
if((seq_length-matched)<flank_size && rest < min_gap) continue ;
//get the read sequence
char qseq[seq_length];
uint8_t * s = bam_get_seq(b); //bam1_seq(b);
for(int n=0; n<seq_length; n++){
char v = bam_seqi(s,n); //bam1_seqi(s,n);
qseq[n] = seq_nt16_str[v]; //bam_nt16_rev_table[v];
}
return(string(qseq,seq_length));
}
return("");
};
bool is_current_read_unaligned(){
return(b->core.flag & BAM_FUNMAP);
};
int get_coverage(string chrom, int pos1, int pos2){
//have to do this formatting hack to get the chromosome index.
stringstream formatted_pos;
formatted_pos << chrom << ":" << pos1 << "-" << pos2;
int result=0;
hts_itr_t *iter = sam_itr_querys(idx,header,formatted_pos.str().c_str());
if(iter==NULL) return -1;
while ( sam_itr_next(in, iter, b) >= 0) result++;
hts_itr_destroy(iter);
return result;
}
vector< pair<int,int> > get_bad_pair_positions(string chrom,int start,int end){
vector< pair<int,int> > result;
stringstream formatted_pos;
formatted_pos << chrom << ":" << start << "-" << end;
hts_itr_t *iter = sam_itr_querys(idx,header,formatted_pos.str().c_str());
if(iter==NULL) return result;
while(sam_itr_next(in, iter, b) >= 0){
int proper_pair = b->core.flag & BAM_FPROPER_PAIR;
int mate_unmapped = b->core.flag & BAM_FMUNMAP;
if(!proper_pair & !mate_unmapped ){
bool diff_chrom = header->target_name[b->core.tid] != header->target_name[b->core.mtid];
if(!diff_chrom & (bam_is_rev(b)!=bam_is_mrev(b)) & (b->core.pos < b->core.mpos)){
result.push_back( make_pair(b->core.pos+b->core.l_qseq,b->core.mpos));
}
else if(diff_chrom | bam_is_rev(b)==bam_is_mrev(b) & b->core.pos!=b->core.mpos )
cout << "Non-linear: " << header->target_name[b->core.tid] << ":" << b->core.pos << "\t"
<< header->target_name[b->core.mtid] << ":" << b->core.mpos << endl;
}
}
hts_itr_destroy(iter);
return result;
}
pair<int, int> get_allele_depth(string & chrom, int & pos){
stringstream formatted_pos;
formatted_pos << chrom << ":" << pos << "-" << pos;
hts_itr_t *iter = sam_itr_querys(idx,header,formatted_pos.str().c_str());
if(iter==NULL) return make_pair(-1,-1);
int tally[4]={0};
while ( sam_itr_next(in, iter, b) >= 0){
//first work out the offset between the start of the mapped position
//and the SNP position
uint32_t map_pos=b->core.pos;
int diff=pos-map_pos;
//now get the cigar string and work out which base in the read we need
//to look at.
uint32_t *cigar = bam_get_cigar(b);
int read_offset=0, k=0;
stringstream cig;
while( k < b->core.n_cigar & diff>=0){
int c_oper=(cigar[k])&BAM_CIGAR_MASK;
int c_size=(cigar[k])>>BAM_CIGAR_SHIFT;
cig << c_size ;
switch(c_oper)
{
case BAM_CMATCH: {
diff-=c_size; cig << "M" ;
read_offset+=c_size;
break;
}
case BAM_CINS:{ cig << "I" ;
read_offset+=c_size; break ;}
case BAM_CDEL: { cig << "D" ;
diff-=c_size; break;}
case BAM_CREF_SKIP: { cig << "N" ;
diff-=c_size; break;}
case BAM_CSOFT_CLIP: { cig << "S" ;
read_offset+=c_size; break;}
default:
break;
}
k++;
}
read_offset+=diff; //final adjustment
if(read_offset>b->core.l_qseq | read_offset<0 | diff>0)
continue;
/** cout << " Pos="<<pos<<" MapPos=" << map_pos ;
cout << " Diff="<<pos-map_pos;
cout << " ReadOffset="<<read_offset ;
cout << " ReadLength="<<b->core.l_qseq;
cout << " Cigar=" << cig.str();**/
uint8_t * seq = bam_get_seq(b); //bam1_seq(b);
char base = bam_seqi(seq,read_offset); //bam1_seqi(s,n);
tally[seq_nt16_int[base]]++;
// cout << " " << seq_nt16_str[base] << endl;
}
pair<int,int> allele_count(0,0);
for(int t=0; t<4 ; t++){
if(tally[t]>allele_count.first){
allele_count.second=allele_count.first;
allele_count.first=tally[t];
} else if ( tally[t]>allele_count.second ){
allele_count.second=tally[t];
}
// cout << t << ":" << tally[t] << " ";
}
// cout << endl;
hts_itr_destroy(iter);
return allele_count;
}
int get_nreads(){
return nread;
}
void destroy(){
bam_destroy1(b);
bam_hdr_destroy(header);
sam_close(in);
}
} ;