-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgbk2proteome.molec.weights.py
executable file
·48 lines (40 loc) · 1.61 KB
/
gbk2proteome.molec.weights.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
# Given a genbank file, prints to stdout a tab-delimited summary
# of the proteome including the molecular weight in Daltons, where
# '~<value>' indicates the MW is estimated due to ambiguous amino acid(s)
# Usage: python script.py <input.gbk>
import os
import string
import sys
from argparse import ArgumentParser
from Bio import SeqIO, SeqUtils
from decimal import Decimal, ROUND_UP
def parseArgs():
parser = ArgumentParser(description='Calculates the molecular weight of proteins in a GenBank file')
parser.add_argument('-i', '--infile', required=True, help='input GenBank file with translations')
args = parser.parse_args()
return args
def calc_molec_weight(protein_seq):
return Decimal(SeqUtils.molecular_weight(protein_seq, seq_type='protein')
).quantize(Decimal('.001'), rounding=ROUND_UP)
def main():
opts = parseArgs()
infile = opts.infile
gb = SeqIO.parse(infile, 'genbank')
print 'Locus_Tag\tProduct\tProtein_MolecWt_[Da]\tProtein_Seq' #Header
for rec in gb:
for s in rec.features:
if (s.type == 'CDS' and 'translation') in s.qualifiers:
locus = s.qualifiers['locus_tag'][0]
product = s.qualifiers['product'][0]
protein = s.qualifiers['translation'][0]
if 'X' in protein:
num_X = protein.count('X')
estimated_mw = Decimal(num_X) * Decimal('128.16') #UNK=(C5)(H6)(N1)(O3)=128.16
peptide_mw = calc_molec_weight(protein.replace('X', ''))
protein_mw = '~' + str(peptide_mw + estimated_mw)
else:
protein_mw = calc_molec_weight(protein)
print '{}\t{}\t{}\t{}'.format(locus,product,protein_mw,protein)
if __name__ == '__main__':
main()