-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathparse.py
211 lines (198 loc) · 8.61 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, re, datetime, gzip, sys
import csv
from Bio import SeqIO
from Bio import Entrez
import requests
import pandas as pd
import numpy as np
from unidecode import unidecode
import math
class parse(object):
    """Parse HI (hemagglutination inhibition) titer files into flat
    measurement dicts, either from flat .tsv files or matrix-style .csv
    tables with reference sera as columns."""

    def __init__(self, **kwargs):
        # Metadata column headers expected at the left edge of matrix-style
        # tables; '' covers unnamed filler columns.
        self.table_column_names = ['viruses', 'other', 'collection', 'passage', '']
        # Canonical two-fold dilution series (as strings); values outside this
        # list are flagged by check_titer_values.
        self.titer_values = ['10.0', '20.0', '40.0', '80.0', '160.0', '320.0',
                             '640.0', '1280.0', '2560.0', '5120.0', 'nan']

    def parse(self, ftype='flat', **kwargs):
        '''
        Parse HI data files to create a list of measurement documents.

        :param ftype: 'flat' for tab-separated flat files, 'tables' for
            matrix-style csv tables; anything else yields an empty list.
        :return: list of measurement dicts.
        '''
        flat_measurements = list()
        if ftype == "flat":
            flat_measurements = self.read_flat(**kwargs)
        elif ftype == "tables":
            HI_titers = self.read_tables(**kwargs)
            flat_measurements = self.table_to_flat(HI_titers)
        return flat_measurements

    def read_flat(self, path, fstem, **kwargs):
        '''
        Read a flat titer table from <path><fstem>.tsv.

        Example line:
        A/SYDNEY/5/1997 A/NETHERLANDS/22/2003 NL/22/03 Smith2004 320.0

        :raises IOError: if the file does not exist.
        :return: list of measurement dicts keyed by the file's header row,
            with 'passage' derived from the serum id.
        '''
        fname = path + fstem + ".tsv"
        # os.path.isfile never raises -- the original try/except IOError around
        # it could never fire; test the return value explicitly instead.
        if not os.path.isfile(fname):
            raise IOError(fname, "not found")
        flat_measurements = list()
        with open(fname) as infile:
            table_reader = csv.reader(infile, delimiter="\t")
            header = next(table_reader)
            # map column index -> column name, e.g.
            # {0: 'virus_strain', 1: 'serum_strain', 2: 'serum_id',
            #  3: 'source', 4: 'titer'}
            headers = dict(enumerate(header))
            for row in table_reader:
                # tolerate short rows by padding missing trailing fields with ''
                m = {key: row[ii] if ii < len(row) else "" for ii, key in headers.items()}
                # legacy files label the serum column 'ferret_id'
                if 'ferret_id' in m:
                    m['serum_id'] = m.pop('ferret_id')
                if re.search(r'[Ee][Gg][Gg]', m['serum_id']):  # TODO FIX THIS FOR LATER IMPORTS
                    m['passage'] = 'egg'
                else:
                    m['passage'] = 'cell'
                flat_measurements.append(m)
        return flat_measurements

    def read_tables(self, path, fstem, **kwargs):
        '''
        Read the csv table at <path><fstem>.csv into a data frame with
        reference sera as columns.
        '''
        fname = path + fstem + ".csv"
        # DataFrame.append was removed in pandas 2.x; with a single source
        # file the parsed frame is the whole result.  Use
        # pd.concat([...]) here if multiple files are ever combined.
        return self.parse_HI_matrix(fname)

    def table_to_flat(self, HI_table):
        '''
        Convert a matrix-style HI table into a list of flat measurement
        dicts, one per (test virus, reference serum) pair with a titer.
        '''
        flat_measurements = list()
        # the first four columns are metadata; serum columns start at index 4
        for ref_serum in HI_table.columns[4:]:
            try:
                # hoist the not-NaN mask instead of recomputing it per column
                has_titer = ~np.isnan(HI_table[ref_serum])
                sub_set_vals = HI_table[ref_serum][has_titer]
                sub_set_source = HI_table['source'][has_titer]
                sub_set_date = HI_table['collection'][has_titer]
                sub_set_passage = HI_table['passage'][has_titer]
                sub_set_ref = HI_table['ref/test'][has_titer]
                for virus, val, src_id, date, passage, ref in zip(
                        sub_set_vals.index, sub_set_vals, sub_set_source,
                        sub_set_date, sub_set_passage, sub_set_ref):
                    # ref_serum is a (serum_strain, serum_id) tuple built by
                    # parse_HI_matrix
                    flat_measurements.append({'virus_strain': virus,
                                              'serum_strain': ref_serum[0],
                                              'ferret_id': ref_serum[1],
                                              'source': src_id,
                                              'titer': val,
                                              'date': date,
                                              'passage': passage,
                                              'ref': ref})
            except Exception:
                print("Couldn't parse this serum's measurements", ref_serum)
                print("Check fields at top left of file")
        return flat_measurements

    def parse_HI_matrix(self, fname):
        '''
        Parse one HI matrix csv file into a dataframe with reference sera as
        (strain, serum_id) tuple columns and virus strains as the index.

        :param fname: path to the csv file; its basename becomes the source id.
        :return: pandas DataFrame
        '''
        src_id = fname.split('/')[-1]
        with self.myopen(fname) as infile:
            csv_reader = csv.reader(infile)
            # parse sera from the first header rows
            row1 = next(csv_reader)
            row2 = next(csv_reader)
            row3 = next(csv_reader)
            # starting 2016, a passage-history row was inserted, so serum ids
            # live on the fourth row instead of the third
            if self.determine_source_year(src_id) >= 2016:
                row3 = next(csv_reader)
            fields = self.determine_columns(row1)
            ref_sera_start = len(fields)
            # some files carry the metadata headers on the second row instead
            if not all(field in fields for field in ['collection', 'passage']):
                fields = self.determine_columns(row2)
                fields[0] = 'viruses'
                ref_sera_start = len(fields)
            try:
                fields.remove("viruses")
            except ValueError:
                print("couldn't remove viruses field", fields, src_id)
            # keep only the part of each serum id before any '*' footnote marker
            row3 = [re.match(r'^([^\*]*)', cell).group(0).upper() for cell in row3]
            # reference sera become (strain, serum_id) column labels;
            # zip() is lazy on py3, so materialize before slicing
            ref_sera = [[(e1 + '/' + e2), e3.replace(' ', '')]
                        for e1, e2, e3 in list(zip(row1, row2, row3))[ref_sera_start:]]
            fields = ['source', 'ref/test'] + fields + [tuple(s) for s in ref_sera]
            for row in csv_reader:  # advance until the reference virus section
                if row[0].startswith('REFERENCE'):
                    break
            ref_strains = []
            ref_matrix = []
            for row in csv_reader:
                if row[0].startswith('TEST'):
                    break
                else:  # load rows until the test virus section starts
                    ref_strains.append(row[0].strip())
                    ref_matrix.append([src_id, 'ref']
                                      + [c.strip() for c in row[1:ref_sera_start]]
                                      + [self.titer_to_number(c) for c in row[ref_sera_start:]])
            test_strains = []
            test_matrix = []
            for row in csv_reader:
                try:
                    # py3 csv already yields str (no .decode needed);
                    # transliterate any accented strain names to ascii
                    name = unidecode(row[0].strip())
                    test_strains.append(name)
                    titers = [self.titer_to_number(c) for c in row[ref_sera_start:]]
                    test_matrix.append([src_id, 'test']
                                       + [c.strip() for c in row[1:ref_sera_start]]
                                       + titers)
                    self.check_titer_values(titers, src_id)
                except Exception:
                    print("Couldn't parse name from file", row[0].strip(), src_id)
            HI_table = pd.DataFrame(ref_matrix + test_matrix,
                                    index=ref_strains + test_strains,
                                    columns=fields)
            # drop the uninformative 'other' and unnamed columns
            # (drop(label, 1) positional axis was removed in pandas 2.x)
            if 'other' in HI_table:
                HI_table = HI_table.drop(columns='other')
            while '' in HI_table:
                HI_table = HI_table.drop(columns='')
            return HI_table

    def determine_columns(self, row1):
        '''
        Collect the leading metadata column names (lowercased) from a header
        row, stopping at the first cell that is not a known metadata column.
        '''
        fields = []
        for col in row1:
            name = col.strip().lower()
            if name not in self.table_column_names:
                break
            fields.append(name)
        return fields

    def determine_source_year(self, src_id):
        '''
        Extract the 4-digit year from a source file name such as
        NIMR_Feb2012_10.csv or NIMR-report-Feb2011_04.csv.

        # starting 2016, included passage history row

        :return: year as int, or 0 if no year can be found.
        '''
        year = 0
        match = re.match(r'\D+(\d\d\d\d)', src_id)  # match once, reuse the group
        if match:
            try:
                year = int(match.group(1))
            except ValueError:
                print("couldn't parse source file name to get year")
        return year

    def titer_to_number(self, val):
        '''
        Convert one raw titer cell to a float.

        '<X' (below detection) -> NaN; '>X' -> X; '0  160'-style cells -> the
        trailing number; bare values one dilution short (e.g. '4' for '40')
        are scaled by 10; anything unparseable -> NaN.
        '''
        try:
            if '<' in val:
                return np.nan
            elif '>' in val:
                # float('>1280') raised and silently fell through to NaN in
                # the original; strip the '>' so the intended value is kept
                return float(val.lstrip('>'))
            match = re.match(r'0\s+([0-9]+)', val)  # e.g. '0 160'
            if match:
                return float(match.group(1))
            if val + '.0' not in self.titer_values:
                # value may be recorded one dilution low; scale and re-check
                temp = str(float(val) * 10.0)
                if temp in self.titer_values:
                    return float(temp)
            return float(val)
        except (TypeError, ValueError):
            # print("Bad HI measurement:", val)
            return np.nan

    def check_titer_values(self, titers, src_id):
        '''
        Print a warning for any titer outside the expected dilution series.
        '''
        for t in titers:
            if str(t) not in self.titer_values:
                print("Weird titer value", t, src_id)

    def myopen(self, fname, mode='rt'):
        '''
        Open a possibly gzip-compressed file in text mode.  The legacy 'rU'
        default was removed in Python 3.11; 'rt' is the equivalent and also
        makes gzip.open return decoded text.
        '''
        if fname.endswith('gz'):
            return gzip.open(fname, mode)
        else:
            return open(fname, mode)