Skip to content

Commit

Permalink
Merge pull request #132 from Proteobench/add_modification_to_ion
Browse files Browse the repository at this point in the history
Add modifications to ion
  • Loading branch information
wolski authored Nov 23, 2023
2 parents e509ecf + 6c64c1d commit c2549b5
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 6 deletions.
27 changes: 26 additions & 1 deletion proteobench/modules/dda_quant/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,17 @@

import numpy as np
import pandas as pd
import psm_utils.io.maxquant as maxquant
import streamlit as st

from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo
from proteobench.modules.dda_quant.datapoint import Datapoint
from proteobench.modules.dda_quant.parse import ParseInputs
from proteobench.modules.dda_quant.parse import (
ParseInputs,
get_proforma_alphapept,
get_proforma_msfragger,
get_proforma_sage,
)
from proteobench.modules.dda_quant.parse_settings import (
DDA_QUANT_RESULTS_REPO,
ParseSettings,
Expand Down Expand Up @@ -166,14 +172,33 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame:

if input_format == "MaxQuant":
input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False)
input_data_frame["proforma"] = [
maxquant.MSMSReader._parse_peptidoform(mod_seq, z).proforma.split("/")[
0
]
for mod_seq, z in input_data_frame[
["Modified sequence", "Charge"]
].values.tolist()
]

elif input_format == "AlphaPept":
input_data_frame = pd.read_csv(input_csv, low_memory=False)
input_data_frame["proforma"] = input_data_frame["sequence"].apply(
get_proforma_alphapept
)
elif input_format == "Sage":
input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False)
input_data_frame["proforma"] = input_data_frame["peptide"].apply(
get_proforma_sage
)
elif input_format == "MSFragger":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
input_data_frame["proforma"] = input_data_frame["Modified Sequence"].apply(
get_proforma_msfragger
)
elif input_format == "WOMBAT":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
input_data_frame["proforma"] = input_data_frame["modified_peptide"]
input_data_frame["Sequence"] = input_data_frame["modified_peptide"].apply(
self.strip_sequence_wombat
)
Expand Down
140 changes: 135 additions & 5 deletions proteobench/modules/dda_quant/parse.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import re
from typing import Dict, List

import pandas as pd
Expand All @@ -8,18 +9,140 @@
from proteobench.modules.interfaces import ParseInputsInterface


def count_chars(input_string):
    """Return how many characters of ``input_string`` are uppercase letters.

    A character is counted only when it is both alphabetic and uppercase;
    digits, punctuation, whitespace and lowercase letters are ignored.
    """
    total = 0
    for symbol in input_string:
        if symbol.isupper() and symbol.isalpha():
            total += 1
    return total


def match_brackets(input_string):
    """Find every ``[...]`` group in ``input_string``.

    Returns a ``(mods, positions)`` pair of lazy generators, both in
    left-to-right order of appearance:

    * ``mods`` yields the text between each pair of square brackets.
    * ``positions`` yields, for each group, the number of uppercase letters
      (as counted by ``count_chars``) found before the bracket content.
    """
    bracket_re = r"\[([^]]+)\]"
    found = []
    for hit in re.finditer(bracket_re, input_string):
        # Keep the bracket content and the offset at which that content starts.
        found.append((hit.group(1), hit.start(1)))

    mods = (entry[0] for entry in found)
    positions = (count_chars(input_string[: entry[1]]) for entry in found)
    return mods, positions


def get_proforma_sage(
    input_string,
    modification_dict={
        "+57.0215": "Carbamidomethyl",
        "+15.9949": "Oxidation",
        "-17.026548": "Gln->pyro-Glu",
        "-18.010565": "Glu->pyro-Glu",
        "+42": "Acetyl",
    },
):
    """Rewrite a bracket-annotated peptide string with named modifications.

    The input is scanned left to right. Text outside square brackets is
    reduced to its uppercase letters (the residues); every ``[label]`` group
    is replaced by ``[name]``, where ``name`` is looked up in
    ``modification_dict``. A label that is not in the mapping becomes an
    empty pair of brackets (``[]``).

    Because each bracket is emitted exactly where it occurs, a modification
    written after the last residue is kept (it used to be dropped), several
    brackets at the same position are all kept, and characters inside a
    bracket are never mistaken for residues.

    Parameters
    ----------
    input_string : str
        Peptide string with modifications in square brackets, e.g.
        ``"AC[+57.0215]DE"``.
    modification_dict : dict
        Maps a bracket label to a modification name. The default mapping is
        only read, never mutated.

    Returns
    -------
    str
        The residues with every bracket label replaced by its mapped name.
    """

    def residues(fragment):
        # Only uppercase letters are residues; separators such as "-" and
        # lowercase markers are discarded.
        return [char for char in fragment if char.isalpha() and char.isupper()]

    parts = []
    cursor = 0
    for match in re.finditer(r"\[([^]]+)\]", input_string):
        parts.extend(residues(input_string[cursor : match.start()]))
        name = modification_dict.get(match.group(1), "")
        parts.append(f"[{name}]")
        cursor = match.end()
    # Residues after the final bracket (or the whole string if no brackets).
    parts.extend(residues(input_string[cursor:]))
    return "".join(parts)


def get_proforma_msfragger(
    input_string,
    modification_dict={
        "57.0215": "Carbamidomethyl",
        "15.9949": "Oxidation",
        "-17.026548": "Gln->pyro-Glu",
        "-18.010565": "Glu->pyro-Glu",
        "42.0106": "Acetyl",
    },
):
    """Rewrite a bracket-annotated peptide string with named modifications.

    The input is scanned left to right. Text outside square brackets is
    reduced to its uppercase letters (the residues), so lowercase markers
    such as a leading ``n`` are discarded. Every ``[label]`` group is
    replaced by ``[name]``, where ``name`` is looked up in
    ``modification_dict``. A label that is not in the mapping becomes an
    empty pair of brackets (``[]``).

    Because each bracket is emitted exactly where it occurs, a modification
    written after the last residue is kept (it used to be dropped), several
    brackets at the same position are all kept, and characters inside a
    bracket are never mistaken for residues.

    Parameters
    ----------
    input_string : str
        Peptide string with modifications in square brackets, e.g.
        ``"AAC[57.0215]K"``.
    modification_dict : dict
        Maps a bracket label to a modification name. The default mapping is
        only read, never mutated.

    Returns
    -------
    str
        The residues with every bracket label replaced by its mapped name.
    """

    def residues(fragment):
        # Only uppercase letters are residues; everything else is discarded.
        return [char for char in fragment if char.isalpha() and char.isupper()]

    parts = []
    cursor = 0
    for match in re.finditer(r"\[([^]]+)\]", input_string):
        parts.extend(residues(input_string[cursor : match.start()]))
        name = modification_dict.get(match.group(1), "")
        parts.append(f"[{name}]")
        cursor = match.end()
    # Residues after the final bracket (or the whole string if no brackets).
    parts.extend(residues(input_string[cursor:]))
    return "".join(parts)


def get_proforma_alphapept(
    input_string,
    modification_dict={
        "ox": "Oxidation",
        "c": "Carbamidomethyl",
        "a": "Acetyl",
        "decoy": "",
    },
):
    """Rewrite a peptide string with lowercase tokens as bracketed names.

    Each run of lowercase letters in ``input_string`` is treated as one
    modification token and translated through ``modification_dict``. The
    token's position is the number of uppercase characters before it (as
    reported by ``match_seq``). The output is the input with all lowercase
    characters removed, with ``[name]`` inserted directly after the
    character at that position.

    Raises
    ------
    KeyError
        If a lowercase token is not a key of ``modification_dict``.
    """
    tokens, offsets = match_seq(input_string, pattern=re.compile(r"([a-z]+)"))

    # Position -> modification name; a later token at the same position wins.
    mod_at = {}
    for offset, token in zip(offsets, tokens):
        mod_at[offset] = modification_dict[token]

    backbone = [char for char in input_string if not char.islower()]

    pieces = []
    for idx, aa in enumerate(backbone):
        pieces.append(aa)
        if idx in mod_at:
            pieces.append(f"[{mod_at[idx]}]")
    return "".join(pieces)


def count_upper_chars(input_string):
    """Return the number of uppercase characters in ``input_string``."""
    return len([char for char in input_string if char.isupper()])


def match_seq(input_string, pattern=re.compile(r"([a-z]+)")):
    """Find every match of ``pattern`` (first capture group) in ``input_string``.

    Returns a ``(mods, positions)`` pair of lazy generators, both in
    left-to-right order of appearance:

    * ``mods`` yields the text captured by group 1 of each match.
    * ``positions`` yields, for each match, the number of uppercase
      characters (as counted by ``count_upper_chars``) before the capture.
    """
    found = []
    for hit in pattern.finditer(input_string):
        # Keep the captured text and the offset at which it starts.
        found.append((hit.group(1), hit.start(1)))

    mods = (entry[0] for entry in found)
    positions = (count_upper_chars(input_string[: entry[1]]) for entry in found)
    return mods, positions


class ParseInputs(ParseInputsInterface):
def convert_to_standard_format(
self, df: pd.DataFrame, parse_settings: ParseSettings
) -> tuple[pd.DataFrame, Dict[int, List[str]]]:
"""Convert a search engine output into a generic format supported by the module."""
#TODO add functionality/steps in docstring
"""Convert a software tool output into a generic format supported by the module."""
# TODO add functionality/steps in docstring

for k, v in parse_settings.mapper.items():
if k not in df.columns:
raise ImportError(
f"Column {k} not found in input dataframe."
" Please check input file and selected search engine."
" Please check input file and selected software tool."
)

df.rename(columns=parse_settings.mapper, inplace=True)
Expand Down Expand Up @@ -60,8 +183,15 @@ def convert_to_standard_format(
df = pd.concat([df, pd.get_dummies(df["Raw file"])], axis=1)

# TODO, if "Charge" is not available return a sensible error
# TODO, include modifications for ion
df.loc[df.index, "peptidoform"] = df.loc[df.index, "Sequence"]+"|Z="+df.loc[df.index, "Charge"].map(str)
try:
df.loc[df.index, "peptidoform"] = (
df.loc[df.index, "proforma"]
+ "|Z="
+ df.loc[df.index, "Charge"].map(str)
)
except KeyError:
# TODO if charge is not available it is now replaced with 2
df.loc[df.index, "peptidoform"] = df.loc[df.index, "proforma"] + "|Z=2"

# TODO use peptide_ion or peptidoform here
# TODO move this to datapoint, keep a count here of quantified AA
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"matplotlib",
"importlib-metadata; python_version < '3.8'",
"toml",
"psm_utils",
]
dynamic = ["version", "description"]
keywords = ['proteomics', 'peptides', 'retention time', 'mass spectrometry']
Expand Down

0 comments on commit c2549b5

Please sign in to comment.