Recognize mod labels as terminal groups and auto-add hydrogen. Recognize formulas in term groups in Composition
levitsky · Jun 15, 2024 · 07b425a · 07b425a
1 parent 3475ff7
commit 07b425a
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 21 deletions.
diff --git a/doc/source/mass.rst b/doc/source/mass.rst
@@ -152,8 +152,7 @@ modification on a specific residue:
 
 `Unimod database <http://www.unimod.org>`_ is an excellent resource for the
 information on the chemical compositions of known protein modifications.
-Version 2.0.3 introduces :py:class:`pyteomics.mass.Unimod` class that can serve
-as a Python interface to Unimod:
+The :py:class:`pyteomics.mass.Unimod` class is a simple Python interface to Unimod:
 
 .. code-block:: python
 
@@ -189,6 +188,23 @@ as a Python interface to Unimod:
         >>> mass.calculate_mass('Ac-PEPacTIDE-OH', aa_comp=aa_comp)
         883.38109339411  # correct!
 
+    The following will produce the same result. **Pyteomics** recognizes that you are using a *mod* label instead
+    of a terminal group and adds a hydrogen implicitly:
+
+    .. code-block:: python
+
+        >>> aa_comp = dict(mass.std_aa_comp)
+        >>> aa_comp['ac'] = db.by_title('Acetyl')['composition']
+        >>> mass.calculate_mass('ac-PEPacTIDE-OH', aa_comp=aa_comp)
+        883.38109339411  # correct!
+
+    For completeness, note that you can actually specify terminal groups directly by their formula in the sequence:
+
+    .. code-block:: python
+
+        >>> mass.calculate_mass('CH3CO-PEPacTIDE-OH', aa_comp=aa_comp)
+        883.38109339411
+
 Faster mass calculations
 ------------------------
 

diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py
@@ -164,10 +164,25 @@ class Composition(BasicComposition):
     def _from_parsed_sequence(self, parsed_sequence, aa_comp):
         self.clear()
         comp = defaultdict(int)
+        failflag = False
         for label in parsed_sequence:
             if label in aa_comp:
                 for elem, cnt in aa_comp[label].items():
                     comp[elem] += cnt
+            elif parser.is_term_group(label):
+                slabel = label.strip('-')
+                if slabel in aa_comp:
+                    # a modification label used as terminal group. Need to add one hydrogen to the comp
+                    if not slabel.islower():
+                        failflag = True
+                    else:
+                        comp += aa_comp[slabel]
+                        comp['H'] += 1
+
+                elif re.match(_formula, slabel):
+                    comp += Composition(formula=slabel)
+                else:
+                    failflag = True
             else:
                 try:
                     mod, aa = parser._split_label(label)
@@ -176,7 +191,9 @@ def _from_parsed_sequence(self, parsed_sequence, aa_comp):
                         comp[elem] += cnt
 
                 except (PyteomicsError, KeyError):
-                    raise PyteomicsError('No information for %s in `aa_comp`' % label)
+                    failflag = True
+            if failflag:
+                raise PyteomicsError('No information for %s in `aa_comp`' % label)
         self._from_composition(comp)
 
     def _from_split_sequence(self, split_sequence, aa_comp):
@@ -186,23 +203,23 @@ def _from_split_sequence(self, split_sequence, aa_comp):
             i = 0
             while i < len(group):
                 for j in range(len(group) + 1, -1, -1):
-                    try:
-                        label = ''.join(group[i:j])
+                    label = ''.join(group[i:j])
+                    if label in aa_comp:
                         for elem, cnt in aa_comp[label].items():
                             comp[elem] += cnt
-                    except KeyError:
-                        continue
+                    elif parser.is_term_group(label) and label.strip('-') in aa_comp:
+                        comp += aa_comp[label.strip('-')] + {'H': 1}
                     else:
-                        i = j
-                        break
+                        continue
+                    i = j
+                    break
                 if j == 0:
                     raise PyteomicsError("Invalid group starting from position %d: %s" % (i + 1, group))
         self._from_composition(comp)
 
     def _from_sequence(self, sequence, aa_comp):
         parsed_sequence = parser.parse(
             sequence,
-            labels=aa_comp,
             show_unmodified_termini=True)
         self._from_parsed_sequence(parsed_sequence, aa_comp)
 
@@ -255,8 +272,7 @@ def __init__(self, *args, **kwargs):
         Parameters
         ----------
         formula : str, optional
-            A string with a chemical formula. All elements must be present in
-            `mass_data`.
+            A string with a chemical formula.
         sequence : str, optional
             A polypeptide sequence string in modX notation.
         parsed_sequence : list of str, optional
@@ -999,7 +1015,11 @@ def fast_mass2(sequence, ion_type=None, charge=None, **kwargs):
                 mass += aa_mass[aa] * num
             elif parser.is_term_mod(aa):
                 assert num == 1
-                mass += calculate_mass(formula=aa.strip('-'), mass_data=mass_data)
+                group = aa.strip('-')
+                if group in aa_mass:
+                    mass += aa_mass[group] + mass_data['H'][0][0]
+                else:
+                    mass += calculate_mass(formula=group, mass_data=mass_data)
             else:
                 mod, X = parser._split_label(aa)
                 mass += (aa_mass[mod] + aa_mass[X]) * num

diff --git a/tests/test_mass.py b/tests/test_mass.py
@@ -40,17 +40,17 @@ def setUp(self):
             for i in range(10)]
 
         self.aa_comp = {
-            'X':   mass.Composition({'A': 1}, mass_data=self.mass_data),
-            'Y':   mass.Composition({'B': 1}, mass_data=self.mass_data),
-            'Z':   mass.Composition({'C': 1}, mass_data=self.mass_data),
-            'F':   mass.Composition({'F': 1}, mass_data=self.mass_data),
-            'H-':  mass.Composition({'D': 1}, mass_data=self.mass_data),
-            '-OH': mass.Composition({'E': 1}, mass_data=self.mass_data),
+            'X':   mass.Composition({'A': 1}),
+            'Y':   mass.Composition({'B': 1}),
+            'Z':   mass.Composition({'C': 1}),
+            'F':   mass.Composition({'F': 1}),
+            'H-':  mass.Composition({'D': 1}),
+            '-OH': mass.Composition({'E': 1}),
         }
 
         self.ion_comp = {
-            'M': mass.Composition({}, mass_data=self.mass_data),
-            'a': mass.Composition({'A': -1}, mass_data=self.mass_data)
+            'M': mass.Composition({}),
+            'a': mass.Composition({'A': -1})
         }
 
         self.mods = {'a': mass.Composition(A=1), 'b': mass.Composition(B=1)}
@@ -78,6 +78,40 @@ def test_fast_mass2_term(self):
                     self.mass_data['A'][0][0] + self.mass_data['B'][0][0] * 2 + self.mass_data['C'][0][0] * 3 +
                     self.mass_data['D'][0][0] + self.mass_data['E'][0][0] * 2 + self.mass_data['F'][0][0] * 3))
 
+    def test_fast_mass2_term_label(self):
+        mass_data = dict(self.mass_data)
+        mass_data['H'] = {0: (self.mass_H, 1.0)}
+        mass_data['O'] = {0: (self.mass_O, 1.0)}
+        aa_mass = self.test_aa_mass.copy()
+        aa_mass.update({k: mass.calculate_mass(composition=v, mass_data=mass_data) for k, v in self.mods.items()})
+        for pep in self.random_peptides:
+            for mlabel, mcomp in self.mods.items():
+                mpep = mlabel + '-' + pep + '-' + mlabel
+                self.assertAlmostEqual(
+                    mass.fast_mass2(mpep, mass_data=mass_data, aa_mass=aa_mass),
+                    mass.fast_mass2(pep, mass_data=mass_data, aa_mass=aa_mass)
+                        + 2 * mass.calculate_mass(composition=mcomp, mass_data=mass_data)
+                        - self.mass_O
+                    )
+
+    def test_composition_term(self):
+        aa_comp = self.aa_comp.copy()
+        aa_comp.update(self.mods)
+        for pep in self.random_peptides:
+            for mlabel, mcomp in self.mods.items():
+                mpep = mlabel + '-' + pep + '-' + mlabel
+                self.assertEqual(mass.Composition(sequence=mpep, aa_comp=aa_comp),
+                    mass.Composition(sequence=pep, aa_comp=aa_comp) - aa_comp['H-'] - aa_comp['-OH'] + mcomp * 2 + {'H': 2})
+
+    def test_composition_term_sseq(self):
+        aa_comp = self.aa_comp.copy()
+        aa_comp.update(self.mods)
+        for pep in self.random_peptides:
+            for mlabel, mcomp in self.mods.items():
+                split_sequence = parser.parse(pep, split=True)
+                self.assertEqual(mass.Composition(split_sequence=[
+                    (mlabel + '-',) + split_sequence[0]] + split_sequence[1:-1] + [split_sequence[-1] + ('-' + mlabel,)], aa_comp=aa_comp),
+                    mass.Composition(sequence=pep, aa_comp=aa_comp) - aa_comp['H-'] - aa_comp['-OH'] + mcomp * 2 + {'H': 2})
 
     def test_Composition_dict(self):
         # Test Composition from a dict.
@@ -103,6 +137,10 @@ def test_Composition_sseq(self):
             mass.Composition(split_sequence=[('X',), ('Y',), ('Z',)], aa_comp=self.aa_comp),
             {atom: 1 for atom in 'ABC'})
 
+    def test_Composition_term_formula(self):
+        self.assertEqual(mass.Composition(sequence='A2B-XYZ-DE2F3', aa_comp=self.aa_comp),
+            {'A': 3, 'B': 2, 'C': 1, 'D': 1, 'E': 2, 'F': 3})
+
     def test_Composition_sum(self):
         # Test sum of Composition objects.
         self.assertEqual(