-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdata_iter.py
99 lines (94 loc) · 4.08 KB
/
data_iter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
import pdb
try:
import pyBigWig
except:
pdb.set_trace()
import pyBigWig
from keras.preprocessing.image import Iterator
# Modified from keras
class DataIterator(Iterator):
    """Batch iterator over genomic intervals with sequence and bigWig signal.

    Each entry of ``data_list`` is an indexable record laid out as::

        (chrom, start, stop, shift, bigwig_files, meta[, labels])

    where ``bigwig_files`` is a sequence of bigWig paths, ``meta`` is a
    per-sample feature vector (may be empty) and the optional seventh item
    holds the per-sample labels.  A record of length 7 marks the whole
    data set as labeled.

    NOTE(review): relies on the legacy Keras ``Iterator`` contract in which
    ``index_generator`` yields ``(index_array, current_index,
    current_batch_size)`` and the base ``__init__`` takes
    ``(n, batch_size, shuffle, seed)`` -- confirm against the installed
    Keras version.
    """

    # Weight given to the third bigWig track when it is folded into the
    # second one (the second track keeps ``1 - weight``).
    _THIRD_TRACK_WEIGHT = 0.75

    def __init__(self, data_list, genome, batch_size, seqlen, bigwig_rc_order=None, shuffle=False, seed=1337):
        """Set up the iterator.

        Args:
            data_list: sequence of sample records (see class docstring);
                ``None`` or an empty sequence yields an iterator of length 0.
            genome: mapping from chromosome name to an array that can be
                sliced by position; each slice must be assignable to a
                ``(seqlen, 4)`` boolean array (presumably one-hot A/C/G/T --
                TODO confirm against the caller).
            batch_size: number of samples per batch.
            seqlen: window length, in positions, extracted around the
                centre of every interval.
            bigwig_rc_order: channel permutation applied to the bigWig
                tracks for the reverse strand; defaults to the identity.
            shuffle: forwarded to the Keras ``Iterator`` base class.
            seed: forwarded to the Keras ``Iterator`` base class.
        """
        self.data_list = data_list
        has_data = data_list is not None and len(data_list) > 0
        if has_data:
            first = data_list[0]
            self.num_bigwigs = len(first[4])
            self.num_meta = len(first[5])
            self.labeled = len(first) == 7
        else:
            self.num_bigwigs = 0
            # Previously left unset for empty input, which made next() fail
            # with AttributeError.
            self.num_meta = 0
            self.labeled = False
        if self.labeled:
            self.num_tfs = len(first[6])
        if bigwig_rc_order is None:
            self.bigwig_rc_order = np.arange(self.num_bigwigs)
        else:
            self.bigwig_rc_order = bigwig_rc_order
        self.genome = genome
        self.seqlen = seqlen
        self.nucleotides = np.array(['A', 'C', 'G', 'T'])
        num_samples = len(data_list) if data_list is not None else 0
        super(DataIterator, self).__init__(num_samples, batch_size, shuffle, seed)

    def __len__(self):
        """Return the number of samples (not batches) in ``data_list``."""
        if self.data_list is None:
            return 0
        return len(self.data_list)

    def next(self):
        """Build and return the next batch.

        Returns:
            ``batch_x`` when the data is unlabeled, otherwise the tuple
            ``(batch_x, batch_y)``.  ``batch_x`` is the forward-strand array
            of shape ``(batch, 1, seqlen, channels)`` when there are no
            metadata features, and the list ``[forward, reverse, meta]``
            when there are.  ``batch_y`` is a boolean array of shape
            ``(batch, num_tfs)``.
        """
        # Only the index bookkeeping needs the lock; assembling the batch
        # below can proceed concurrently in several threads.
        with self.lock:
            index_array, current_index, current_batch_size = next(self.index_generator)

        batch_X_seq = np.zeros((current_batch_size, self.seqlen, 4), dtype=bool)
        batch_X_bigwig = np.zeros((current_batch_size, self.seqlen, self.num_bigwigs), dtype=np.float32)
        if self.num_meta:
            batch_X_meta = np.zeros((current_batch_size, self.num_meta), dtype=np.float32)
        if self.labeled:
            batch_y = np.zeros((current_batch_size, self.num_tfs), dtype=bool)

        half_window = self.seqlen // 2
        weight = self._THIRD_TRACK_WEIGHT

        for i, j in enumerate(index_array):
            data = self.data_list[j]
            chrom = data[0]
            start = data[1]
            stop = data[2]
            shift = data[3]
            bigwig_files = data[4]
            meta = data[5]

            if shift:
                # Random jitter of the window by up to `shift` positions.
                offset = np.random.randint(-shift, shift + 1)
                start += offset
                stop += offset

            # Re-centre a window of `seqlen` positions on the interval
            # midpoint.  Floor division keeps the coordinates integral under
            # Python 3 as well (identical to `/` on ints under Python 2).
            med = (start + stop) // 2
            start = med - half_window
            stop = med + half_window

            batch_X_seq[i] = self.genome[chrom][start:stop]
            if self.num_meta:
                batch_X_meta[i] = meta

            for k, bigwig_file in enumerate(bigwig_files):
                bigwig = pyBigWig.open(bigwig_file)
                try:
                    sample_bigwig = np.array(bigwig.values(chrom, start, stop))
                finally:
                    # Release the handle even if the lookup fails.
                    bigwig.close()
                # Positions with no signal are returned as NaN; use 0.
                sample_bigwig[np.isnan(sample_bigwig)] = 0
                batch_X_bigwig[i, :, k] = sample_bigwig
                if k == 2:
                    # Fold the third track into the second as a weighted
                    # average.
                    batch_X_bigwig[i, :, k - 1] = (
                        (1 - weight) * batch_X_bigwig[i, :, k - 1]
                        + weight * batch_X_bigwig[i, :, k]
                    )

            if self.labeled:
                # Entries not set here keep their initial value of 0
                # (binding code 'U').
                batch_y[i] = data[6]

        # Reverse both the position axis and the channel axis (presumably
        # the reverse complement for A/C/G/T channel order -- TODO confirm).
        batch_X_seq_rc = batch_X_seq[:, ::-1, ::-1]

        # With exactly three tracks the third has been merged into the
        # second, so only the first two channels are emitted.  This is
        # decided from the configured track count rather than from the inner
        # loop variable, which was undefined when there were no bigWig files
        # and otherwise depended on the last sample processed.
        if self.num_bigwigs == 3:
            batch_X_bigwig = batch_X_bigwig[:, :, :2]
            batch_X_bigwig_rc = batch_X_bigwig[:, ::-1, self.bigwig_rc_order[:2]]
        else:
            batch_X_bigwig_rc = batch_X_bigwig[:, ::-1, self.bigwig_rc_order]

        batch_X_fwd = np.concatenate([batch_X_seq, batch_X_bigwig, batch_X_bigwig_rc], axis=-1)
        batch_X_rev = np.concatenate([batch_X_seq_rc, batch_X_bigwig_rc, batch_X_bigwig], axis=-1)
        # Insert a singleton axis after the batch axis.
        batch_X_fwd = np.expand_dims(batch_X_fwd, axis=1)
        batch_X_rev = np.expand_dims(batch_X_rev, axis=1)

        if self.num_meta:
            batch_x = [batch_X_fwd, batch_X_rev, batch_X_meta]
        else:
            batch_x = batch_X_fwd

        if self.labeled:
            return batch_x, batch_y
        return batch_x