diff --git a/proteobench/modules/dda_quant/module.py b/proteobench/modules/dda_quant/module.py index 1f857c63..8638f63f 100644 --- a/proteobench/modules/dda_quant/module.py +++ b/proteobench/modules/dda_quant/module.py @@ -10,11 +10,17 @@ import numpy as np import pandas as pd +import psm_utils.io.maxquant as maxquant import streamlit as st from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo from proteobench.modules.dda_quant.datapoint import Datapoint -from proteobench.modules.dda_quant.parse import ParseInputs +from proteobench.modules.dda_quant.parse import ( + ParseInputs, + get_proforma_alphapept, + get_proforma_msfragger, + get_proforma_sage, +) from proteobench.modules.dda_quant.parse_settings import ( DDA_QUANT_RESULTS_REPO, ParseSettings, @@ -166,14 +172,33 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame: if input_format == "MaxQuant": input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False) + input_data_frame["proforma"] = [ + maxquant.MSMSReader._parse_peptidoform(mod_seq, z).proforma.split("/")[ + 0 + ] + for mod_seq, z in input_data_frame[ + ["Modified sequence", "Charge"] + ].values.tolist() + ] + elif input_format == "AlphaPept": input_data_frame = pd.read_csv(input_csv, low_memory=False) + input_data_frame["proforma"] = input_data_frame["sequence"].apply( + get_proforma_alphapept + ) elif input_format == "Sage": input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False) + input_data_frame["proforma"] = input_data_frame["peptide"].apply( + get_proforma_sage + ) elif input_format == "MSFragger": input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t") + input_data_frame["proforma"] = input_data_frame["Modified Sequence"].apply( + get_proforma_msfragger + ) elif input_format == "WOMBAT": input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",") + input_data_frame["proforma"] = input_data_frame["modified_peptide"] input_data_frame["Sequence"] = input_data_frame["modified_peptide"].apply( self.strip_sequence_wombat ) diff --git a/proteobench/modules/dda_quant/parse.py b/proteobench/modules/dda_quant/parse.py index 96c345d0..ff7f98e8 100644 --- a/proteobench/modules/dda_quant/parse.py +++ b/proteobench/modules/dda_quant/parse.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import Dict, List import pandas as pd @@ -8,18 +9,140 @@ from proteobench.modules.interfaces import ParseInputsInterface +def count_chars(input_string): + return sum(1 for char in input_string if char.isalpha() and char.isupper()) + + +def match_brackets(input_string): + pattern = r"\[([^]]+)\]" + matches = [ + (match.group(1), match.start(1), match.end(1)) + for match in re.finditer(pattern, input_string) + ] + positions = (count_chars(input_string[0 : m[1]]) for m in matches) + mods = (m[0] for m in matches) + return mods, positions + + +def get_proforma_sage( + input_string, + modification_dict={ + "+57.0215": "Carbamidomethyl", + "+15.9949": "Oxidation", + "-17.026548": "Gln->pyro-Glu", + "-18.010565": "Glu->pyro-Glu", + "+42": "Acetyl", + }, +): + modifications, positions = match_brackets(input_string) + + new_modifications = [] + for m in modifications: + try: + new_modifications.append(modification_dict[m]) + except KeyError: + new_modifications.append("") + modifications = new_modifications + + pos_mod_dict = dict(zip(positions, modifications)) + + stripped_seq = "".join( + char for char in input_string if char.isalpha() and char.isupper() + ) + + new_seq = "" + for idx, aa in enumerate(stripped_seq): + if idx in pos_mod_dict.keys(): + new_seq += f"[{pos_mod_dict[idx]}]" + new_seq += aa + + return new_seq + + +def get_proforma_msfragger( + input_string, + modification_dict={ + "57.0215": "Carbamidomethyl", + "15.9949": "Oxidation", + "-17.026548": "Gln->pyro-Glu", + "-18.010565": "Glu->pyro-Glu", + "42.0106": "Acetyl", + }, +): + modifications, positions = match_brackets(input_string) + + new_modifications = [] + for m in modifications: + try: + new_modifications.append(modification_dict[m]) + except KeyError: + new_modifications.append("") + modifications = new_modifications + + pos_mod_dict = dict(zip(positions, modifications)) + + stripped_seq = "".join( + char for char in input_string if char.isalpha() and char.isupper() + ) + + new_seq = "" + for idx, aa in enumerate(stripped_seq): + if idx in pos_mod_dict.keys(): + new_seq += f"[{pos_mod_dict[idx]}]" + new_seq += aa + + return new_seq + + +def get_proforma_alphapept( + input_string, + modification_dict={ + "ox": "Oxidation", + "c": "Carbamidomethyl", + "a": "Acetyl", + "decoy": "", + }, +): + modifications, positions = match_seq(input_string, pattern=re.compile(r"([a-z]+)")) + modifications = (modification_dict[m] for m in modifications) + pos_mod_dict = dict(zip(positions, modifications)) + + stripped_seq = "".join(char for char in input_string if not char.islower()) + + new_seq = "" + for idx, aa in enumerate(stripped_seq): + new_seq += aa + if idx in pos_mod_dict.keys(): + new_seq += f"[{pos_mod_dict[idx]}]" + return new_seq + + +def count_upper_chars(input_string): + return sum(1 for char in input_string if char.isupper()) + + +def match_seq(input_string, pattern=re.compile(r"([a-z]+)")): + matches = [ + (match.group(1), match.start(1), match.end(1)) + for match in pattern.finditer(input_string) + ] + positions = (count_upper_chars(input_string[0 : m[1]]) for m in matches) + mods = (m[0] for m in matches) + return mods, positions + + class ParseInputs(ParseInputsInterface): def convert_to_standard_format( self, df: pd.DataFrame, parse_settings: ParseSettings ) -> tuple[pd.DataFrame, Dict[int, List[str]]]: - """Convert a search engine output into a generic format supported by the module.""" - #TODO add functionality/steps in docstring + """Convert a software tool output into a generic format supported by the module.""" + # TODO add functionality/steps in docstring for k, v in parse_settings.mapper.items(): if k not in df.columns: raise ImportError( f"Column {k} not found in input dataframe." - " Please check input file and selected search engine." + " Please check input file and selected software tool." ) df.rename(columns=parse_settings.mapper, inplace=True) @@ -60,8 +183,15 @@ def convert_to_standard_format( df = pd.concat([df, pd.get_dummies(df["Raw file"])], axis=1) # TODO, if "Charge" is not available return a sensible error - # TODO, include modifications for ion - df.loc[df.index, "peptidoform"] = df.loc[df.index, "Sequence"]+"|Z="+df.loc[df.index, "Charge"].map(str) + try: + df.loc[df.index, "peptidoform"] = ( + df.loc[df.index, "proforma"] + + "|Z=" + + df.loc[df.index, "Charge"].map(str) + ) + except KeyError: + # TODO if charge is not available it is now replaced with 2 + df.loc[df.index, "peptidoform"] = df.loc[df.index, "proforma"] + "|Z=2" # TODO use peptide_ion or peptidoform here # TODO move this to datapoint, keep a count here of quantified AA diff --git a/pyproject.toml b/pyproject.toml index 78edaab5..af8d7d95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "matplotlib", "importlib-metadata; python_version < '3.8'", "toml", + "psm_utils", ] dynamic = ["version", "description"] keywords = ['proteomics', 'peptides', 'retention time', 'mass spectrometry']