Skip to content

Commit

Permalink
Recognize mod labels as terminal groups and auto-add hydrogen. Recogn…
Browse files Browse the repository at this point in the history
…ize formulas in term groups in Composition
  • Loading branch information
levitsky committed Jun 15, 2024
1 parent 3475ff7 commit 07b425a
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 21 deletions.
20 changes: 18 additions & 2 deletions doc/source/mass.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,7 @@ modification on a specific residue:
`Unimod database <http://www.unimod.org>`_ is an excellent resource for the
information on the chemical compositions of known protein modifications.
Version 2.0.3 introduces :py:class:`pyteomics.mass.Unimod` class that can serve
as a Python interface to Unimod:
The :py:class:`pyteomics.mass.Unimod` class is a simple Python interface to Unimod:

.. code-block:: python
Expand Down Expand Up @@ -189,6 +188,23 @@ as a Python interface to Unimod:
>>> mass.calculate_mass('Ac-PEPacTIDE-OH', aa_comp=aa_comp)
883.38109339411 # correct!
The following will produce the same result. **Pyteomics** recognizes that you are using a *mod* label in place
of a terminal group and implicitly adds one hydrogen atom to its composition:

.. code-block:: python
>>> aa_comp = dict(mass.std_aa_comp)
>>> aa_comp['ac'] = db.by_title('Acetyl')['composition']
>>> mass.calculate_mass('ac-PEPacTIDE-OH', aa_comp=aa_comp)
883.38109339411 # correct!
For completeness, note that you can also specify terminal groups directly by their chemical formula in the sequence:

.. code-block:: python
>>> mass.calculate_mass('CH3CO-PEPacTIDE-OH', aa_comp=aa_comp)
883.38109339411
Faster mass calculations
------------------------

Expand Down
42 changes: 31 additions & 11 deletions pyteomics/mass/mass.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,25 @@ class Composition(BasicComposition):
def _from_parsed_sequence(self, parsed_sequence, aa_comp):
self.clear()
comp = defaultdict(int)
failflag = False
for label in parsed_sequence:
if label in aa_comp:
for elem, cnt in aa_comp[label].items():
comp[elem] += cnt
elif parser.is_term_group(label):
slabel = label.strip('-')
if slabel in aa_comp:
# a modification label used as terminal group. Need to add one hydrogen to the comp
if not slabel.islower():
failflag = True
else:
comp += aa_comp[slabel]
comp['H'] += 1

elif re.match(_formula, slabel):
comp += Composition(formula=slabel)
else:
failflag = True
else:
try:
mod, aa = parser._split_label(label)
Expand All @@ -176,7 +191,9 @@ def _from_parsed_sequence(self, parsed_sequence, aa_comp):
comp[elem] += cnt

except (PyteomicsError, KeyError):
raise PyteomicsError('No information for %s in `aa_comp`' % label)
failflag = True
if failflag:
raise PyteomicsError('No information for %s in `aa_comp`' % label)
self._from_composition(comp)

def _from_split_sequence(self, split_sequence, aa_comp):
Expand All @@ -186,23 +203,23 @@ def _from_split_sequence(self, split_sequence, aa_comp):
i = 0
while i < len(group):
for j in range(len(group) + 1, -1, -1):
try:
label = ''.join(group[i:j])
label = ''.join(group[i:j])
if label in aa_comp:
for elem, cnt in aa_comp[label].items():
comp[elem] += cnt
except KeyError:
continue
elif parser.is_term_group(label) and label.strip('-') in aa_comp:
comp += aa_comp[label.strip('-')] + {'H': 1}
else:
i = j
break
continue
i = j
break
if j == 0:
raise PyteomicsError("Invalid group starting from position %d: %s" % (i + 1, group))
self._from_composition(comp)

def _from_sequence(self, sequence, aa_comp):
    """Fill this composition from a sequence string.

    The string is handed to ``parser.parse`` with ``aa_comp`` as the set
    of known labels and with unmodified termini requested explicitly;
    the resulting label list is then passed on to
    ``_from_parsed_sequence`` together with ``aa_comp``.
    """
    self._from_parsed_sequence(
        parser.parse(sequence, labels=aa_comp, show_unmodified_termini=True),
        aa_comp,
    )

Expand Down Expand Up @@ -255,8 +272,7 @@ def __init__(self, *args, **kwargs):
Parameters
----------
formula : str, optional
A string with a chemical formula. All elements must be present in
`mass_data`.
A string with a chemical formula.
sequence : str, optional
A polypeptide sequence string in modX notation.
parsed_sequence : list of str, optional
Expand Down Expand Up @@ -999,7 +1015,11 @@ def fast_mass2(sequence, ion_type=None, charge=None, **kwargs):
mass += aa_mass[aa] * num
elif parser.is_term_mod(aa):
assert num == 1
mass += calculate_mass(formula=aa.strip('-'), mass_data=mass_data)
group = aa.strip('-')
if group in aa_mass:
mass += aa_mass[group] + mass_data['H'][0][0]
else:
mass += calculate_mass(formula=group, mass_data=mass_data)
else:
mod, X = parser._split_label(aa)
mass += (aa_mass[mod] + aa_mass[X]) * num
Expand Down
54 changes: 46 additions & 8 deletions tests/test_mass.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,17 @@ def setUp(self):
for i in range(10)]

self.aa_comp = {
'X': mass.Composition({'A': 1}, mass_data=self.mass_data),
'Y': mass.Composition({'B': 1}, mass_data=self.mass_data),
'Z': mass.Composition({'C': 1}, mass_data=self.mass_data),
'F': mass.Composition({'F': 1}, mass_data=self.mass_data),
'H-': mass.Composition({'D': 1}, mass_data=self.mass_data),
'-OH': mass.Composition({'E': 1}, mass_data=self.mass_data),
'X': mass.Composition({'A': 1}),
'Y': mass.Composition({'B': 1}),
'Z': mass.Composition({'C': 1}),
'F': mass.Composition({'F': 1}),
'H-': mass.Composition({'D': 1}),
'-OH': mass.Composition({'E': 1}),
}

self.ion_comp = {
'M': mass.Composition({}, mass_data=self.mass_data),
'a': mass.Composition({'A': -1}, mass_data=self.mass_data)
'M': mass.Composition({}),
'a': mass.Composition({'A': -1})
}

self.mods = {'a': mass.Composition(A=1), 'b': mass.Composition(B=1)}
Expand Down Expand Up @@ -78,6 +78,40 @@ def test_fast_mass2_term(self):
self.mass_data['A'][0][0] + self.mass_data['B'][0][0] * 2 + self.mass_data['C'][0][0] * 3 +
self.mass_data['D'][0][0] + self.mass_data['E'][0][0] * 2 + self.mass_data['F'][0][0] * 3))

def test_fast_mass2_term_label(self):
    """Check ``fast_mass2`` when modification labels are used as terminal groups.

    For every random peptide and every label in ``self.mods``, the mass of
    ``<label>-<peptide>-<label>`` must almost equal the mass of the bare
    peptide plus twice the modification mass minus one oxygen mass.
    """
    # Extend the fixture's isotope table with the H and O entries used here.
    isotopes = dict(self.mass_data)
    isotopes['H'] = {0: (self.mass_H, 1.0)}
    isotopes['O'] = {0: (self.mass_O, 1.0)}

    # Residue masses plus one entry per modification label.
    residue_masses = self.test_aa_mass.copy()
    mod_masses = {}
    for label, composition in self.mods.items():
        mod_masses[label] = mass.calculate_mass(composition=composition, mass_data=isotopes)
    residue_masses.update(mod_masses)

    for peptide in self.random_peptides:
        for label, composition in self.mods.items():
            labelled = '-'.join((label, peptide, label))
            observed = mass.fast_mass2(labelled, mass_data=isotopes, aa_mass=residue_masses)
            expected = (
                mass.fast_mass2(peptide, mass_data=isotopes, aa_mass=residue_masses)
                + 2 * mass.calculate_mass(composition=composition, mass_data=isotopes)
                - self.mass_O
            )
            self.assertAlmostEqual(observed, expected)

def test_composition_term(self):
    """Check ``Composition(sequence=...)`` with modification labels as termini.

    The composition of ``<label>-<peptide>-<label>`` must equal that of the
    bare peptide with the ``'H-'`` and ``'-OH'`` entries of ``aa_comp``
    removed, two copies of the modification added, and two extra hydrogens.
    """
    compositions = dict(self.aa_comp)
    compositions.update(self.mods)

    for peptide in self.random_peptides:
        for label, mod_composition in self.mods.items():
            labelled = '-'.join((label, peptide, label))
            observed = mass.Composition(sequence=labelled, aa_comp=compositions)
            bare = mass.Composition(sequence=peptide, aa_comp=compositions)
            expected = bare - compositions['H-'] - compositions['-OH'] + mod_composition * 2 + {'H': 2}
            self.assertEqual(observed, expected)

def test_composition_term_sseq(self):
    """Check ``Composition(split_sequence=...)`` with modification labels as termini.

    The peptide is parsed with ``split=True``; ``'<label>-'`` is prepended
    to the first group and ``'-<label>'`` appended to the last one. The
    result must equal the bare peptide composition without the ``'H-'`` and
    ``'-OH'`` entries, plus two modifications and two hydrogens.
    """
    compositions = dict(self.aa_comp)
    compositions.update(self.mods)

    for peptide in self.random_peptides:
        for label, mod_composition in self.mods.items():
            groups = parser.parse(peptide, split=True)
            leading = (label + '-',) + groups[0]
            trailing = groups[-1] + ('-' + label,)
            labelled_groups = [leading] + groups[1:-1] + [trailing]

            observed = mass.Composition(split_sequence=labelled_groups, aa_comp=compositions)
            bare = mass.Composition(sequence=peptide, aa_comp=compositions)
            expected = bare - compositions['H-'] - compositions['-OH'] + mod_composition * 2 + {'H': 2}
            self.assertEqual(observed, expected)

def test_Composition_dict(self):
# Test Composition from a dict.
Expand All @@ -103,6 +137,10 @@ def test_Composition_sseq(self):
mass.Composition(split_sequence=[('X',), ('Y',), ('Z',)], aa_comp=self.aa_comp),
{atom: 1 for atom in 'ABC'})

def test_Composition_term_formula(self):
    """Check that terminal groups written as formulas are parsed in a sequence.

    ``'A2B-'`` and ``'-DE2F3'`` are given directly as formulas around the
    residues ``XYZ``; the expected element counts are compared against the
    resulting composition.
    """
    expected = {'A': 3, 'B': 2, 'C': 1, 'D': 1, 'E': 2, 'F': 3}
    observed = mass.Composition(sequence='A2B-XYZ-DE2F3', aa_comp=self.aa_comp)
    self.assertEqual(observed, expected)

def test_Composition_sum(self):
# Test sum of Composition objects.
self.assertEqual(
Expand Down

0 comments on commit 07b425a

Please sign in to comment.