-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathelife_upload.py
105 lines (100 loc) · 4.56 KB
/
elife_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os, re, time, datetime, csv, sys, json
from upload import upload
from rethinkdb import r
from Bio import SeqIO
import argparse
from parse import parse
from upload import parser
sys.path.append('') # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
class elife_upload(upload):
    '''
    Titer-measurement uploader built on the `upload` base class.

    Methods called on `self` that are not defined here (parse, filter,
    create_index, upload_documents, the format_* helpers, ...) are expected
    to be provided by the base class.
    '''

    # Placeholder stored when no assay date can be determined.
    _UNKNOWN_DATE = "XXXX-XX-XX"

    def __init__(self, **kwargs):
        '''
        Forward all keyword arguments to the base-class initialiser.
        '''
        upload.__init__(self, **kwargs)

    def upload(self, ftype='flat', preview=False, **kwargs):
        '''
        format virus information, then upload to database

        ftype   -- passed through to self.parse
        preview -- when true, print the first formatted measurement instead
                   of uploading anything
        kwargs  -- forwarded to parse, format_measurements and
                   upload_documents
        '''
        print("Uploading Viruses to TDB")
        measurements = self.parse(ftype, **kwargs)
        print('Formatting documents for upload')
        # format_measurements edits the measurement dicts in place.
        self.format_measurements(measurements, **kwargs)
        measurements = self.filter(measurements)
        measurements = self.create_index(measurements)
        print('Total number of indexes', len(self.indexes), 'Total number of measurements', len(measurements))
        if not preview:
            self.upload_documents(self.table, measurements, index='index', **kwargs)
            return
        print("Titer Measurements:")
        # Guard the sample print: nothing may be left after filtering.
        if len(measurements) > 0:
            print(json.dumps(measurements[0], indent=1))
        else:
            print("No measurements to preview")
        print("Remove \"--preview\" to upload documents")
        print("Printed preview of viruses to be uploaded to make sure fields make sense")

    def _assay_date_from_fstem(self, fstem):
        '''
        Derive an assay date from a file stem.

        Only the text before the first '-' is considered.  Returns
        'YYYY-MM-DD' when the first eight characters of that text are all
        digits and start with '20'.  Otherwise returns the 'XXXX-XX-XX'
        placeholder when the text is eight characters or shorter, and None
        when it is longer (the caller then keeps whatever date it already
        has).
        '''
        token = fstem.split('-')[0]
        stamp = token[:8]
        if len(stamp) == 8 and stamp.startswith('20') and stamp.isdigit():
            return "{}-{}-{}".format(stamp[0:4], stamp[4:6], stamp[6:8])
        if len(token) > 8:
            return None
        return self._UNKNOWN_DATE

    def format_measurements(self, measurements, **kwargs):
        '''
        format virus information in preparation to upload to database table

        measurements -- iterable of measurement dicts, modified in place
        kwargs       -- must contain 'fstem' (used to derive the assay date);
                        also forwarded to add_attributes

        Returns the same `measurements` object.
        '''
        self.fix_whole_name = self.define_strain_fixes(self.strain_fix_fname)
        self.HI_ref_name_abbrev = self.define_strain_fixes(self.HI_ref_name_abbrev_fname)
        self.define_location_label_fixes("source-data/flu_fix_location_label.tsv")
        self.define_countries("source-data/geo_synonyms.tsv")
        for meas in measurements:
            meas['virus_strain'], meas['original_virus_strain'] = self.fix_name(self.HI_fix_name(meas['virus_strain'], serum=False))
            meas['serum_strain'], meas['original_serum_strain'] = self.fix_name(self.HI_fix_name(meas['serum_strain'], serum=True))
            self.test_location(meas['virus_strain'])
            self.test_location(meas['serum_strain'])
            self.add_attributes(meas, **kwargs)
            self.format_subtype(meas)
            self.format_assay_type(meas)
            self.format_date(meas)
            # The date encoded in the file stem takes precedence.  Stems whose
            # leading token is longer than eight characters are truncated to
            # eight and parsed too (previously the truncated value was
            # computed but never used).
            stem_date = self._assay_date_from_fstem(kwargs['fstem'])
            if stem_date is not None:
                meas['assay_date'] = stem_date
            if meas.get('assay_date') is None:
                meas['assay_date'] = self._UNKNOWN_DATE
            self.format_passage(meas, 'serum_passage', 'serum_passage_category')
            self.format_passage(meas, 'virus_passage', 'virus_passage_category')
            self.format_ref(meas)
            self.format_serum_sample(meas)
            # Explicit comparisons are kept on purpose: a 'ref' value that is
            # neither True nor False is recorded in none of the sets.
            if meas['ref'] == True:
                self.ref_serum_strains.add(meas['serum_strain'])
                self.ref_virus_strains.add(meas['virus_strain'])
            elif meas['ref'] == False:
                self.test_virus_strains.add(meas['virus_strain'])
            self.rethink_io.check_optional_attributes(meas, self.optional_fields)
        if len(self.new_different_date_format) > 0:
            print("Found files that had a different date format, need to add to self.different_date_format")
            print(self.new_different_date_format)
        self.check_strain_names(measurements)
        self.disambiguate_sources(measurements)
        return measurements

    def disambiguate_sources(self, measurements):
        '''
        Add counter to sources so that create_index still creates unique identifiers for each
        titer value.

        Each measurement's 'source' is rewritten in place as
        '<source>_<n>', where n counts from 0 per distinct source; a source
        of None is treated as 'UnknownSource'.
        '''
        counters = {}
        for meas in measurements:
            src = meas['source']
            if src is None:
                src = 'UnknownSource'
            # First occurrence gets 0, each later one the next integer.
            counters[src] = counters[src] + 1 if src in counters else 0
            meas['source'] = src + '_' + str(counters[src])
if __name__ == "__main__":
    # Script entry point: parse the command line, make sure the data
    # directory exists, then build the uploader and run it with the same
    # set of options.
    args = parser.parse_args()
    # Use "data/" when no path was given.
    args.path = "data/" if args.path is None else args.path
    path_missing = not os.path.isdir(args.path)
    if path_missing:
        os.makedirs(args.path)
    # The parsed namespace doubles as the keyword arguments for both calls.
    options = vars(args)
    connTDB = elife_upload(**options)
    connTDB.upload(**options)