Skip to content

Commit

Permalink
Merge pull request #10 from bigbio/dev
Browse files Browse the repository at this point in the history
boad added
  • Loading branch information
ypriverol authored Jul 30, 2024
2 parents bef95d5 + 066773c commit 332589e
Show file tree
Hide file tree
Showing 9 changed files with 49 additions and 36 deletions.
1 change: 1 addition & 0 deletions .github/workflows/conda-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,5 @@ jobs:
run: |
conda activate quantmsutils
quantmsutilsc --help
quantmsutilsc --version
shell: bash -l {0}
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ channels:
- conda-forge
dependencies:
- click
- sdrf-pipelines
- sdrf-pipelines>=0.0.28
- pyopenms
- ms2rescore=3.0.2
- psm-utils=0.8.0
Expand Down
1 change: 1 addition & 0 deletions quantmsutils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "0.0.3"
49 changes: 24 additions & 25 deletions quantmsutils/diann/diann2mztab.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def diann_version(self) -> str:
return diann_version_id

def validate_diann_version(self) -> None:
supported_diann_versions = ["1.8.1"]
supported_diann_versions = ["1.8.1", "1.9.beta.1"]
if self.diann_version not in supported_diann_versions:
raise ValueError(f"Unsupported DIANN version {self.diann_version}")

Expand Down Expand Up @@ -732,7 +732,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):

logger.debug("Classifying results type ...")
pg["opt_global_result_type"] = "single_protein"
pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = (
pg.loc[pg["Protein.Group"].str.contains(";"), "opt_global_result_type"] = (
"indistinguishable_protein_group"
)

Expand All @@ -741,7 +741,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
out_mztab_prh = out_mztab_prh.drop(["Protein.Names"], axis=1)
out_mztab_prh.rename(
columns={
"Protein.Group": "accession",
"First.Protein.Description": "description",
},
inplace=True,
Expand All @@ -762,14 +761,14 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):

logger.debug("Extracting accession values (keeping first)...")
out_mztab_prh.loc[:, "accession"] = out_mztab_prh.apply(
lambda x: x["accession"].split(";")[0], axis=1
lambda x: x["Protein.Group"].split(";")[0], axis=1
)

protein_details_df = out_mztab_prh[
out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
]
prh_series = (
protein_details_df["Protein.Ids"]
protein_details_df["Protein.Group"]
.str.split(";", expand=True)
.stack()
.reset_index(level=1, drop=True)
Expand Down Expand Up @@ -806,7 +805,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
# or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null"
out_mztab_prh.loc[:, "ambiguity_members"] = out_mztab_prh.apply(
lambda x: (
x["Protein.Ids"]
x["Protein.Group"]
if x["opt_global_result_type"] == "indistinguishable_protein_group"
else "null"
),
Expand All @@ -817,7 +816,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
score_looker = ModScoreLooker(report)
out_mztab_prh[["modifiedSequence", "best_search_engine_score[1]"]] = (
out_mztab_prh.apply(
lambda x: score_looker.get_score(x["Protein.Ids"]),
lambda x: score_looker.get_score(x["Protein.Group"]),
axis=1,
result_type="expand",
)
Expand All @@ -833,19 +832,19 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
# This used to be a bottleneck in performance
# This implementation drops the run time from 57s to 25ms
protein_agg_report = (
report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]]
.groupby(["study_variable", "Protein.Ids"])
report[["PG.MaxLFQ", "Protein.Group", "study_variable"]]
.groupby(["study_variable", "Protein.Group"])
.agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
.reset_index()
.pivot(columns=["study_variable"], index="Protein.Ids")
.pivot(columns=["study_variable"], index="Protein.Group")
.reset_index()
)
protein_agg_report.columns = [
"::".join([str(s) for s in col]).strip()
for col in protein_agg_report.columns.values
]
subname_mapper = {
"Protein.Ids::::": "Protein.Ids",
"Protein.Group::::": "Protein.Group",
"PG.MaxLFQ::mean": "protein_abundance_study_variable",
"PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
"PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
Expand All @@ -858,7 +857,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
# to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values.
out_mztab_prh = out_mztab_prh.merge(
protein_agg_report,
on="Protein.Ids",
on="Protein.Group",
how="left",
validate="many_to_one",
copy=True,
Expand All @@ -871,7 +870,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
out_mztab_prh.loc[:, "PRH"] = "PRT"
index = out_mztab_prh.loc[:, "PRH"]
out_mztab_prh.drop(
["PRH", "Genes", "modifiedSequence", "Protein.Ids"], axis=1, inplace=True
["PRH", "Genes", "modifiedSequence", "Protein.Group"], axis=1, inplace=True
)
out_mztab_prh.insert(0, "PRH", index)
out_mztab_prh.fillna("null", inplace=True)
Expand Down Expand Up @@ -916,14 +915,14 @@ def mztab_peh(
out_mztab_peh = pd.DataFrame()
out_mztab_peh = pr.iloc[:, 0:10]
out_mztab_peh.drop(
["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"],
["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"],
axis=1,
inplace=True,
)
out_mztab_peh.rename(
columns={
"Stripped.Sequence": "sequence",
"Protein.Ids": "accession",
"Protein.Group": "accession",
"Modified.Sequence": "opt_global_cv_MS:1000889_peptidoform_sequence",
"Precursor.Charge": "charge",
},
Expand Down Expand Up @@ -1123,7 +1122,7 @@ def __find_info(directory, n):
out_mztab_psh = out_mztab_psh[
[
"Stripped.Sequence",
"Protein.Ids",
"Protein.Group",
"Q.Value",
"RT.Start",
"Precursor.Charge",
Expand Down Expand Up @@ -1239,7 +1238,7 @@ def classify_result_type(target):
:return: A string implys protein type
:rtype: str
"""
if ";" in target["Protein.Ids"]:
if ";" in target["Protein.Group"]:
return "indistinguishable_protein_group"
return "single_protein"

Expand Down Expand Up @@ -1293,7 +1292,7 @@ def match_in_report(report, target, max_, flag, level):
return tuple(q_value)

if flag == 1 and level == "protein":
result = report[report["Protein.Ids"] == target]
result = report[report["Protein.Group"] == target]
prh_params = []
for i in range(1, max_ + 1):
match = result[result["study_variable"] == i]
Expand All @@ -1320,9 +1319,9 @@ def __init__(self, report: pd.DataFrame) -> None:

def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
grouped_df = (
report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]]
report[["Modified.Sequence", "Protein.Group", "Global.PG.Q.Value"]]
.sort_values("Global.PG.Q.Value", ascending=True)
.groupby(["Protein.Ids"])
.groupby(["Protein.Group"])
.head(1)
)
# Modified.Sequence Protein.Ids Global.PG.Q.Value
Expand All @@ -1332,7 +1331,7 @@ def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]:
# 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252

out = {
row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
row["Protein.Group"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"])
for _, row in grouped_df.iterrows()
}
return out
Expand Down Expand Up @@ -1578,17 +1577,17 @@ def calculate_protein_coverages(
protein in the PRH table (defined by accession, not protein.ids).
"""
nested_df = (
report[["Protein.Ids", "Stripped.Sequence"]]
.groupby("Protein.Ids")
report[["Protein.Group", "Stripped.Sequence"]]
.groupby("Protein.Group")
.agg({"Stripped.Sequence": set})
.reset_index()
)
# Protein.Ids Stripped.Sequence
# 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR}
# 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER}
# 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,...
ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"]))
acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Ids"]))
ids_to_seqs = dict(zip(nested_df["Protein.Group"], nested_df["Stripped.Sequence"]))
acc_to_ids = dict(zip(out_mztab_prh["accession"], out_mztab_prh["Protein.Group"]))
fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"]))
acc_to_fasta_ids: dict = {}

Expand Down
8 changes: 5 additions & 3 deletions quantmsutils/psm/psm_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"modifications",
"retention_time",
"charge",
"calc_mass_to_charge",
"exp_mass_to_charge",
"reference_file_name",
"scan_number",
"peptidoform",
Expand All @@ -33,6 +33,8 @@


def mods_position(peptide):
if peptide.startswith("."):
peptide = peptide[1:]
pattern = re.compile(r"\((.*?)\)")
original_mods = pattern.findall(peptide)
peptide = re.sub(r"\(.*?\)", ".", peptide)
Expand Down Expand Up @@ -95,7 +97,7 @@ def convert_psm(ctx, idxml: str, spectra_file: str, export_decoy_psm: bool = Fal

for peptide_id in pep_ids:
retention_time = peptide_id.getRT()
calc_mass_to_charge = peptide_id.getMZ()
exp_mass_to_charge = peptide_id.getMZ()
scan_number = int(
re.findall(
r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference")
Expand Down Expand Up @@ -153,7 +155,7 @@ def convert_psm(ctx, idxml: str, spectra_file: str, export_decoy_psm: bool = Fal
modifications,
retention_time,
charge,
calc_mass_to_charge,
exp_mass_to_charge,
reference_file_name,
scan_number,
peptidoform,
Expand Down
7 changes: 2 additions & 5 deletions quantmsutils/quantmsutilsc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,24 @@
from quantmsutils.rescoring.ms2rescore import ms2rescore
from quantmsutils.sdrf.check_samplesheet import check_samplesheet
from quantmsutils.sdrf.extract_sample import extract_sample_from_expdesign
from quantmsutils import __version__

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.version_option(version=__version__, package_name="quantmsutils", message="%(package)s %(version)s")
@click.group(context_settings=CONTEXT_SETTINGS)
def cli():
pass


cli.add_command(dianncfg)
cli.add_command(diann2mztab)

cli.add_command(add_sage_feature)

cli.add_command(mzml_statistics)

cli.add_command(extract_sample_from_expdesign)
cli.add_command(check_samplesheet)

cli.add_command(ms2rescore)

cli.add_command(convert_psm)


Expand Down
13 changes: 13 additions & 0 deletions quantmsutils/rescoring/ms2rescore.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
def parse_cli_arguments_to_config(
config_file: str = None,
feature_generators: str = None,
ms2pip_model_dir: str = None,
ms2pip_model: str = None,
ms2_tolerance: float = None,
calibration_set_size: float = None,
Expand Down Expand Up @@ -46,6 +47,7 @@ def parse_cli_arguments_to_config(
config["ms2rescore"]["feature_generators"]["basic"] = {}
if "ms2pip" in feature_generators_list:
config["ms2rescore"]["feature_generators"]["ms2pip"] = {
"model_dir": ms2pip_model_dir,
"model": ms2pip_model,
"ms2_tolerance": ms2_tolerance,
}
Expand Down Expand Up @@ -76,6 +78,8 @@ def parse_cli_arguments_to_config(
"Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step."
)

if ms2pip_model_dir is not None:
config["ms2rescore"]["ms2pip_model_dir"] = ms2pip_model_dir
if ms2pip_model is not None:
config["ms2rescore"]["ms2pip_model"] = ms2pip_model
if ms2_tolerance is not None:
Expand Down Expand Up @@ -216,6 +220,13 @@ def filter_out_artifact_psms(
type=str,
default="Immuno-HCD",
)
@click.option(
"-md",
"--ms2pip_model_dir",
help="The path of MS²PIP model (default: `./`)",
type=str,
default="./"
)
@click.option(
"-ms2tol",
"--ms2_tolerance",
Expand Down Expand Up @@ -271,6 +282,7 @@ def ms2rescore(
fasta_file,
test_fdr,
feature_generators,
ms2pip_model_dir,
ms2pip_model,
ms2_tolerance,
calibration_set_size,
Expand Down Expand Up @@ -318,6 +330,7 @@ def ms2rescore(
config_file=config_file,
output_path=output_path,
feature_generators=feature_generators,
ms2pip_model_dir=ms2pip_model_dir,
ms2pip_model=ms2pip_model,
processes=processes,
ms2_tolerance=ms2_tolerance,
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
click
sdrf-pipelines
sdrf-pipelines==0.0.28
pyopenms
ms2rescore==3.0.2
psm-utils==0.8.0
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

INSTALL_REQUIRES = [
"click",
"sdrf-pipelines",
"sdrf-pipelines==0.0.28",
"pyopenms",
"ms2rescore==3.0.2",
"psm-utils==0.8.0",
Expand Down

0 comments on commit 332589e

Please sign in to comment.