Add support for AlphaPept, Sage, MSFragger, and WOMBAT

Proteobench · Nov 23, 2023 · 877a471 · 877a471
1 parent 4d4f87e
commit 877a471
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 5 deletions.
diff --git a/proteobench/modules/dda_quant/module.py b/proteobench/modules/dda_quant/module.py
@@ -14,7 +14,12 @@
 
 from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo
 from proteobench.modules.dda_quant.datapoint import Datapoint
-from proteobench.modules.dda_quant.parse import ParseInputs
+from proteobench.modules.dda_quant.parse import (
+    ParseInputs,
+    get_proforma_alphapept,
+    get_proforma_msfragger,
+    get_proforma_sage,
+)
 from proteobench.modules.dda_quant.parse_settings import (
     DDA_QUANT_RESULTS_REPO,
     ParseSettings,
@@ -174,12 +179,22 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame:
 
         elif input_format == "AlphaPept":
             input_data_frame = pd.read_csv(input_csv, low_memory=False)
+            input_data_frame["proforma"] = input_data_frame["sequence"].apply(
+                get_proforma_alphapept
+            )
         elif input_format == "Sage":
             input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False)
+            input_data_frame["proforma"] = input_data_frame["peptide"].apply(
+                get_proforma_sage
+            )
         elif input_format == "MSFragger":
             input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
+            input_data_frame["proforma"] = input_data_frame["Modified Sequence"].apply(
+                get_proforma_msfragger
+            )
         elif input_format == "WOMBAT":
             input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
+            input_data_frame["proforma"] = input_data_frame["modified_peptide"]
             input_data_frame["Sequence"] = input_data_frame["modified_peptide"].apply(
                 self.strip_sequence_wombat
             )

diff --git a/proteobench/modules/dda_quant/parse.py b/proteobench/modules/dda_quant/parse.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from typing import Dict, List
 
 import pandas as pd
@@ -8,6 +9,128 @@
 from proteobench.modules.interfaces import ParseInputsInterface
 
 
+def count_chars(input_string):
+    """Return how many characters of ``input_string`` are upper-case letters."""
+    return len([ch for ch in input_string if ch.isalpha() and ch.isupper()])
+
+
+def match_brackets(input_string):
+    """Locate ``[...]`` groups and report where they sit among the residues.
+
+    Returns a pair of single-use generators ``(mods, positions)``. ``mods``
+    yields the text inside each bracket pair; ``positions`` yields, for the
+    same bracket, the number of upper-case letters found in ``input_string``
+    before that text starts.
+    """
+    pattern = r"\[([^]]+)\]"
+    # Materialise the matches once so both generators can walk them.
+    found = list(re.finditer(pattern, input_string))
+    mods = (hit.group(1) for hit in found)
+    positions = (count_chars(input_string[: hit.start(1)]) for hit in found)
+    return mods, positions
+
+
+def get_proforma_sage(
+    input_string,
+    modification_dict={
+        "+57.0215": "Carbamidomethyl",
+        "+15.9949": "Oxidation",
+        "-17.026548": "Gln->pyro-Glu",
+        "-18.010565": "Glu->pyro-Glu",
+        "+42": "Acetyl",
+    },
+):
+    """Rewrite a Sage peptide string with named modifications in brackets.
+
+    Each ``[...]`` group is looked up in ``modification_dict`` and re-emitted
+    as ``[name]`` at the same place relative to the upper-case residue
+    letters. Characters that are not upper-case letters are dropped from the
+    sequence itself.
+
+    Parameters
+    ----------
+    input_string : str
+        Peptide string with modifications written as ``[...]`` groups.
+    modification_dict : dict
+        Maps bracket content to a modification name. It is only read, never
+        mutated. Content missing from the mapping yields an empty ``[]``.
+
+    Returns
+    -------
+    str
+        The residue letters with ``[name]`` groups interleaved.
+    """
+    modifications, positions = match_brackets(input_string)
+    names = [modification_dict.get(m, "") for m in modifications]
+    # Key: number of residue letters that precede the bracket.
+    pos_mod_dict = dict(zip(positions, names))
+
+    stripped_seq = "".join(
+        char for char in input_string if char.isalpha() and char.isupper()
+    )
+
+    parts = []
+    for idx, aa in enumerate(stripped_seq):
+        if idx in pos_mod_dict:
+            parts.append(f"[{pos_mod_dict[idx]}]")
+        parts.append(aa)
+
+    # A bracket that follows the last residue has position
+    # len(stripped_seq), which the loop above never reaches; emit it here so
+    # a modification on the final residue is not silently lost.
+    if stripped_seq and len(stripped_seq) in pos_mod_dict:
+        parts.append(f"[{pos_mod_dict[len(stripped_seq)]}]")
+
+    return "".join(parts)
+
+
+def get_proforma_msfragger(
+    input_string,
+    modification_dict={
+        "57.0215": "Carbamidomethyl",
+        "15.9949": "Oxidation",
+        "-17.026548": "Gln->pyro-Glu",
+        "-18.010565": "Glu->pyro-Glu",
+        "42.0106": "Acetyl",
+    },
+):
+    """Rewrite an MSFragger modified sequence with named modifications.
+
+    Each ``[...]`` group is looked up in ``modification_dict`` and re-emitted
+    as ``[name]`` at the same place relative to the upper-case residue
+    letters. Characters that are not upper-case letters are dropped from the
+    sequence itself.
+
+    Parameters
+    ----------
+    input_string : str
+        Peptide string with modifications written as ``[...]`` groups.
+    modification_dict : dict
+        Maps bracket content to a modification name. It is only read, never
+        mutated. Content missing from the mapping yields an empty ``[]``.
+
+    Returns
+    -------
+    str
+        The residue letters with ``[name]`` groups interleaved.
+    """
+    modifications, positions = match_brackets(input_string)
+    names = [modification_dict.get(m, "") for m in modifications]
+    # Key: number of residue letters that precede the bracket.
+    pos_mod_dict = dict(zip(positions, names))
+
+    stripped_seq = "".join(
+        char for char in input_string if char.isalpha() and char.isupper()
+    )
+
+    parts = []
+    for idx, aa in enumerate(stripped_seq):
+        if idx in pos_mod_dict:
+            parts.append(f"[{pos_mod_dict[idx]}]")
+        parts.append(aa)
+
+    # A bracket that follows the last residue has position
+    # len(stripped_seq), which the loop above never reaches; emit it here so
+    # a modification on the final residue is not silently lost.
+    if stripped_seq and len(stripped_seq) in pos_mod_dict:
+        parts.append(f"[{pos_mod_dict[len(stripped_seq)]}]")
+
+    return "".join(parts)
+
+
+def get_proforma_alphapept(
+    input_string,
+    modification_dict={
+        "ox": "Oxidation",
+        "c": "Carbamidomethyl",
+        "a": "Acetyl",
+        "decoy": "",
+    },
+):
+    """Rewrite an AlphaPept sequence with named modifications in brackets.
+
+    Runs of lower-case letters are treated as modification tokens. A token's
+    position is the number of upper-case characters before it, and the mapped
+    name is written as ``[name]`` right after the character at that index of
+    the sequence with all lower-case characters removed.
+
+    Parameters
+    ----------
+    input_string : str
+        Sequence string with lower-case modification tokens.
+    modification_dict : dict
+        Maps a token to a modification name. It is only read, never mutated.
+        A token missing from the mapping yields an empty ``[]`` instead of
+        raising ``KeyError``, matching the other converters in this module.
+
+    Returns
+    -------
+    str
+        The non-lower-case characters with ``[name]`` groups interleaved.
+    """
+    modifications, positions = match_seq(input_string, pattern=re.compile(r"([a-z]+)"))
+    # Unknown tokens fall back to "" so one unexpected modification does not
+    # abort conversion of a whole results table.
+    names = [modification_dict.get(m, "") for m in modifications]
+    pos_mod_dict = dict(zip(positions, names))
+
+    stripped_seq = "".join(char for char in input_string if not char.islower())
+
+    parts = []
+    for idx, aa in enumerate(stripped_seq):
+        parts.append(aa)
+        if idx in pos_mod_dict:
+            parts.append(f"[{pos_mod_dict[idx]}]")
+    return "".join(parts)
+
+
+def count_upper_chars(input_string):
+    """Return how many characters of ``input_string`` are upper case."""
+    return len([ch for ch in input_string if ch.isupper()])
+
+
+def match_seq(input_string, pattern=re.compile(r"([a-z]+)")):
+    """Find modification tokens and where they sit in the sequence.
+
+    Returns a pair of single-use generators ``(mods, positions)``. ``mods``
+    yields the first capture group of every ``pattern`` match; ``positions``
+    yields, for the same match, the number of upper-case characters found in
+    ``input_string`` before that group starts.
+    """
+    # Materialise the matches once so both generators can walk them.
+    found = list(pattern.finditer(input_string))
+    mods = (hit.group(1) for hit in found)
+    positions = (count_upper_chars(input_string[: hit.start(1)]) for hit in found)
+    return mods, positions
+
+
 class ParseInputs(ParseInputsInterface):
     def convert_to_standard_format(
         self, df: pd.DataFrame, parse_settings: ParseSettings
@@ -60,10 +183,15 @@ def convert_to_standard_format(
         df = pd.concat([df, pd.get_dummies(df["Raw file"])], axis=1)
 
         # TODO, if "Charge" is not available return a sensible error
-        # TODO, include modifications for ion
-        df.loc[df.index, "peptidoform"] = (
-            df.loc[df.index, "proforma"] + "|Z=" + df.loc[df.index, "Charge"].map(str)
-        )
+        try:
+            df.loc[df.index, "peptidoform"] = (
+                df.loc[df.index, "proforma"]
+                + "|Z="
+                + df.loc[df.index, "Charge"].map(str)
+            )
+        except KeyError:
+            # TODO if charge is not available it is now replaced with 2
+            df.loc[df.index, "peptidoform"] = df.loc[df.index, "proforma"] + "|Z=2"
 
         # TODO use peptide_ion or peptidoform here
         # TODO move this to datapoint, keep a count here of quantified AA

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
   "matplotlib",
   "importlib-metadata; python_version < '3.8'",
   "toml",
+  "psm_utils",
 ]
 dynamic = ["version", "description"]
 keywords = ['proteomics', 'peptides', 'retention time', 'mass spectrometry']