Skip to content

Commit

Permalink
Add support for alphapept, sage, msfragger, and wombat
Browse files Browse the repository at this point in the history
  • Loading branch information
RobbinBouwmeester committed Nov 23, 2023
1 parent 4d4f87e commit 877a471
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 5 deletions.
17 changes: 16 additions & 1 deletion proteobench/modules/dda_quant/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@

from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo
from proteobench.modules.dda_quant.datapoint import Datapoint
from proteobench.modules.dda_quant.parse import ParseInputs
from proteobench.modules.dda_quant.parse import (
ParseInputs,
get_proforma_alphapept,
get_proforma_msfragger,
get_proforma_sage,
)
from proteobench.modules.dda_quant.parse_settings import (
DDA_QUANT_RESULTS_REPO,
ParseSettings,
Expand Down Expand Up @@ -174,12 +179,22 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame:

elif input_format == "AlphaPept":
input_data_frame = pd.read_csv(input_csv, low_memory=False)
input_data_frame["proforma"] = input_data_frame["sequence"].apply(
get_proforma_alphapept
)
elif input_format == "Sage":
input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False)
input_data_frame["proforma"] = input_data_frame["peptide"].apply(
get_proforma_sage
)
elif input_format == "MSFragger":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
input_data_frame["proforma"] = input_data_frame["Modified Sequence"].apply(
get_proforma_msfragger
)
elif input_format == "WOMBAT":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
input_data_frame["proforma"] = input_data_frame["modified_peptide"]
input_data_frame["Sequence"] = input_data_frame["modified_peptide"].apply(
self.strip_sequence_wombat
)
Expand Down
136 changes: 132 additions & 4 deletions proteobench/modules/dda_quant/parse.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import re
from typing import Dict, List

import pandas as pd
Expand All @@ -8,6 +9,128 @@
from proteobench.modules.interfaces import ParseInputsInterface


def count_chars(input_string):
    """Return how many characters of ``input_string`` are uppercase letters.

    Digits, punctuation, whitespace and lowercase letters are not counted.
    """
    total = 0
    for symbol in input_string:
        if symbol.isalpha() and symbol.isupper():
            total += 1
    return total


def match_brackets(input_string):
    """Find every ``[...]`` group in ``input_string``.

    Returns:
        A pair of single-use generators ``(mods, positions)``. ``mods``
        yields the text found between each pair of square brackets;
        ``positions`` yields, for the same bracket, the number of uppercase
        letters that occur before the bracket's content (as counted by
        ``count_chars``).
    """
    found = []
    for hit in re.finditer(r"\[([^]]+)\]", input_string):
        # Keep the bracket content and the index at which that content starts.
        found.append((hit.group(1), hit.start(1)))

    labels = (label for label, _ in found)
    uppercase_before = (count_chars(input_string[:start]) for _, start in found)
    return labels, uppercase_before


# Bracket labels used in Sage peptide strings and the modification names they
# are rewritten to.
_SAGE_MODIFICATION_NAMES = {
    "+57.0215": "Carbamidomethyl",
    "+15.9949": "Oxidation",
    "-17.026548": "Gln->pyro-Glu",
    "-18.010565": "Glu->pyro-Glu",
    "+42": "Acetyl",
}


def get_proforma_sage(input_string, modification_dict=None):
    """Rewrite a Sage peptide string with named modifications.

    Every ``[label]`` group in ``input_string`` is replaced by ``[name]``,
    where ``name`` is looked up in ``modification_dict``. All characters that
    are not uppercase letters (signs, digits, separators) are discarded, so
    labels are expected to contain no uppercase letters: every uppercase
    letter in the input is treated as a residue.

    Examples (given the default mapping)::

        "AC[+57.0215]DE"      -> "AC[Carbamidomethyl]DE"
        "PEPTIDEM[+15.9949]"  -> "PEPTIDEM[Oxidation]"

    Args:
        input_string: Peptide string with bracketed modification labels.
        modification_dict: Mapping from bracket label to modification name.
            Defaults to the built-in Sage mapping when ``None``.

    Returns:
        The residue sequence with each bracket re-inserted at the residue
        boundary where it was found. Labels missing from
        ``modification_dict`` are rendered as an empty bracket ``[]``.
    """
    if modification_dict is None:
        modification_dict = _SAGE_MODIFICATION_NAMES

    labels, positions = match_brackets(input_string)
    # Key: number of residues that precede the bracket. Unknown labels keep
    # their slot but get an empty name (best-effort, no exception).
    mod_at = {
        position: modification_dict.get(label, "")
        for label, position in zip(labels, positions)
    }

    residues = [
        char for char in input_string if char.isalpha() and char.isupper()
    ]

    parts = []
    for idx, residue in enumerate(residues):
        if idx in mod_at:
            parts.append(f"[{mod_at[idx]}]")
        parts.append(residue)

    # A bracket that follows the last residue has position == len(residues),
    # which the loop above never visits; emit it here so that a modification
    # on the final residue is not silently lost.
    if len(residues) in mod_at:
        parts.append(f"[{mod_at[len(residues)]}]")

    return "".join(parts)


# Bracket labels used in MSFragger modified-sequence strings and the
# modification names they are rewritten to.
_MSFRAGGER_MODIFICATION_NAMES = {
    "57.0215": "Carbamidomethyl",
    "15.9949": "Oxidation",
    "-17.026548": "Gln->pyro-Glu",
    "-18.010565": "Glu->pyro-Glu",
    "42.0106": "Acetyl",
}


def get_proforma_msfragger(input_string, modification_dict=None):
    """Rewrite an MSFragger modified sequence with named modifications.

    Every ``[label]`` group in ``input_string`` is replaced by ``[name]``,
    where ``name`` is looked up in ``modification_dict``. All characters that
    are not uppercase letters (digits, lowercase letters, separators) are
    discarded, so labels are expected to contain no uppercase letters: every
    uppercase letter in the input is treated as a residue.

    Examples (given the default mapping)::

        "AC[57.0215]DE"      -> "AC[Carbamidomethyl]DE"
        "PEPTIDEM[15.9949]"  -> "PEPTIDEM[Oxidation]"

    Args:
        input_string: Peptide string with bracketed modification labels.
        modification_dict: Mapping from bracket label to modification name.
            Defaults to the built-in MSFragger mapping when ``None``.

    Returns:
        The residue sequence with each bracket re-inserted at the residue
        boundary where it was found. Labels missing from
        ``modification_dict`` are rendered as an empty bracket ``[]``.
    """
    if modification_dict is None:
        modification_dict = _MSFRAGGER_MODIFICATION_NAMES

    labels, positions = match_brackets(input_string)
    # Key: number of residues that precede the bracket. Unknown labels keep
    # their slot but get an empty name (best-effort, no exception).
    mod_at = {
        position: modification_dict.get(label, "")
        for label, position in zip(labels, positions)
    }

    residues = [
        char for char in input_string if char.isalpha() and char.isupper()
    ]

    parts = []
    for idx, residue in enumerate(residues):
        if idx in mod_at:
            parts.append(f"[{mod_at[idx]}]")
        parts.append(residue)

    # A bracket that follows the last residue has position == len(residues),
    # which the loop above never visits; emit it here so that a modification
    # on the final residue is not silently lost.
    if len(residues) in mod_at:
        parts.append(f"[{mod_at[len(residues)]}]")

    return "".join(parts)


# Lowercase tokens used in AlphaPept sequence strings and the modification
# names they are rewritten to. "decoy" maps to an empty name.
_ALPHAPEPT_MODIFICATION_NAMES = {
    "ox": "Oxidation",
    "c": "Carbamidomethyl",
    "a": "Acetyl",
    "decoy": "",
}

# One or more consecutive lowercase ASCII letters form a single token.
_ALPHAPEPT_TOKEN = re.compile(r"([a-z]+)")


def get_proforma_alphapept(input_string, modification_dict=None):
    """Rewrite an AlphaPept sequence string with bracketed modification names.

    Each run of lowercase letters is treated as a modification token that
    applies to the uppercase residue immediately following it; the token is
    removed and ``[name]`` is written after that residue.

    Example (given the default mapping)::

        "oxMPEPcCK" -> "M[Oxidation]PEPC[Carbamidomethyl]K"

    Only uppercase letters are kept as residues. Token positions are counted
    in uppercase letters, so the output sequence must be indexed the same
    way; any other character (for example the ``_`` of a ``_decoy`` suffix)
    is dropped. A token with no residue after it, such as a trailing
    ``decoy``, produces no bracket.

    Args:
        input_string: Sequence with lowercase modification tokens.
        modification_dict: Mapping from token to modification name. Defaults
            to the built-in AlphaPept mapping when ``None``.

    Returns:
        The residue sequence with ``[name]`` appended to each modified
        residue. Tokens missing from ``modification_dict`` are rendered as
        an empty bracket ``[]`` instead of raising ``KeyError``, matching the
        Sage and MSFragger converters in this module.
    """
    if modification_dict is None:
        modification_dict = _ALPHAPEPT_MODIFICATION_NAMES

    tokens, positions = match_seq(input_string, pattern=_ALPHAPEPT_TOKEN)
    # Key: index (in uppercase letters) of the residue that follows the token.
    mod_at = {
        position: modification_dict.get(token, "")
        for token, position in zip(tokens, positions)
    }

    # Same character test as the position counter, so indices line up.
    residues = [char for char in input_string if char.isupper()]

    parts = []
    for idx, residue in enumerate(residues):
        parts.append(residue)
        if idx in mod_at:
            parts.append(f"[{mod_at[idx]}]")
    return "".join(parts)


def count_upper_chars(input_string):
    """Return the number of characters in ``input_string`` that are uppercase."""
    uppercase = [symbol for symbol in input_string if symbol.isupper()]
    return len(uppercase)


def match_seq(input_string, pattern=re.compile(r"([a-z]+)")):
    """Find every match of ``pattern`` in ``input_string``.

    Args:
        input_string: Text to scan.
        pattern: Compiled regular expression with at least one capture
            group. The default matches runs of lowercase ASCII letters.

    Returns:
        A pair of single-use generators ``(mods, positions)``. ``mods``
        yields the text captured by group 1 of each match; ``positions``
        yields, for the same match, the number of uppercase characters that
        occur before the start of that group (as counted by
        ``count_upper_chars``).
    """
    found = []
    for hit in pattern.finditer(input_string):
        # Keep the captured text and the index at which it starts.
        found.append((hit.group(1), hit.start(1)))

    tokens = (token for token, _ in found)
    uppercase_before = (
        count_upper_chars(input_string[:start]) for _, start in found
    )
    return tokens, uppercase_before


class ParseInputs(ParseInputsInterface):
def convert_to_standard_format(
self, df: pd.DataFrame, parse_settings: ParseSettings
Expand Down Expand Up @@ -60,10 +183,15 @@ def convert_to_standard_format(
df = pd.concat([df, pd.get_dummies(df["Raw file"])], axis=1)

# TODO, if "Charge" is not available return a sensible error
# TODO, include modifications for ion
df.loc[df.index, "peptidoform"] = (
df.loc[df.index, "proforma"] + "|Z=" + df.loc[df.index, "Charge"].map(str)
)
try:
df.loc[df.index, "peptidoform"] = (
df.loc[df.index, "proforma"]
+ "|Z="
+ df.loc[df.index, "Charge"].map(str)
)
except KeyError:
# TODO if charge is not available it is now replaced with 2
df.loc[df.index, "peptidoform"] = df.loc[df.index, "proforma"] + "|Z=2"

# TODO use peptide_ion or peptidoform here
# TODO move this to datapoint, keep a count here of quantified AA
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"matplotlib",
"importlib-metadata; python_version < '3.8'",
"toml",
"psm_utils",
]
dynamic = ["version", "description"]
keywords = ['proteomics', 'peptides', 'retention time', 'mass spectrometry']
Expand Down

0 comments on commit 877a471

Please sign in to comment.