Merge pull request #6 from bigbio/dev
adding big tests for ms2rescore
ypriverol authored May 29, 2024
2 parents 75fb7cf + 30f204b commit d8da3c4
Showing 11 changed files with 127 additions and 95 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/python-app.yml
@@ -38,3 +38,14 @@ jobs:
- name: Test with pytest
run: |
pytest
- name: Download test data
run: |
wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML
wget https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/quantms-utils/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML
- name: Test percolator ms2rescore
run: |
quantmsutilsc ms2rescore -p TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML -s TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML -n 2 -pipm "HCD2021" -fg "ms2pip,deeplc" --id_decoy_pattern "^rev" -t 0.05
# - name: Test mokapot ms2rescore
# run: |
# quantmsutilsc ms2rescore -p TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML -s TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML -n 2 -pipm "HCD2021" -fg "ms2pip,deeplc" --id_decoy_pattern "^rev" -t 0.05 --rescoring_engine "mokapot"
#
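For reference, the new percolator test can be reproduced outside of GitHub Actions with click's test runner. The sketch below is not part of the commit; it assumes the package is installed so that the cli group in quantmsutils.quantmsutilsc is importable, and that the two test files above have already been downloaded into the working directory.

from click.testing import CliRunner

from quantmsutils.quantmsutilsc import cli

# Invoke the ms2rescore subcommand with the same arguments as the CI step.
runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "ms2rescore",
        "-p", "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_comet.idXML",
        "-s", "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML",
        "-n", "2",
        "-pipm", "HCD2021",
        "-fg", "ms2pip,deeplc",
        "--id_decoy_pattern", "^rev",
        "-t", "0.05",
    ],
)
# A zero exit code mirrors a green CI step; result.output carries the log.
assert result.exit_code == 0, result.output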
1 change: 1 addition & 0 deletions environment.yml
@@ -13,6 +13,7 @@ dependencies:
- pydantic
- pandas
- protobuf>=3.9.2,<4 # Ensure compatibility with tensorboard
- numpy
- pip
- pip:
- tensorboard
138 changes: 69 additions & 69 deletions quantmsutils/diann/diann2mztab.py
@@ -44,14 +44,14 @@
@click.option("--qvalue_threshold", "-q", type=float)
@click.pass_context
def diann2mztab(
ctx,
folder,
exp_design,
dia_params,
diann_version,
charge,
missed_cleavages,
qvalue_threshold,
ctx,
folder,
exp_design,
dia_params,
diann_version,
charge,
missed_cleavages,
qvalue_threshold,
):
"""
Convert DIA-NN output to MSstats, Triqler or mzTab.
@@ -61,7 +61,7 @@ def diann2mztab(
:param folder: DiannConvert specifies the folder where the required file resides. The folder contains
the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence
FASTA file, version file of DiaNN and ms_info TSVs
?:param exp_design: Experimental design file
:param exp_design: Experimental design file
:param dia_params: A list contains DIA parameters
:type dia_params: list
:param diann_version: Path to a version file of DIA-NN
@@ -228,7 +228,7 @@ def get_exp_design_dfs(exp_design_file):
lambda x: _true_stem(x["Spectra_Filepath"]), axis=1
)

s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:]
s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:]
s_header = data[empty_row + 1].replace("\n", "").split("\t")
s_data_frame = pd.DataFrame(s_table, columns=s_header)

@@ -265,31 +265,31 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
if aa in aa_mass and not_mod:
aa = aa_mass[aa]
elif (
aa
not in [
"G",
"A",
"V",
"L",
"I",
"F",
"M",
"P",
"W",
"S",
"C",
"T",
"Y",
"N",
"Q",
"D",
"E",
"K",
"R",
"H",
]
and not_mod
and aa != ")"
aa
not in [
"G",
"A",
"V",
"L",
"I",
"F",
"M",
"P",
"W",
"S",
"C",
"T",
"Y",
"N",
"Q",
"D",
"E",
"K",
"R",
"H",
]
and not_mod
and aa != ")"
):
logger.info(f"Unknown amino acid with mass not known:{aa}")
peptide_parts.append(aa)
@@ -367,13 +367,13 @@ def validate_diann_version(self) -> None:
raise ValueError(f"Unsupported DIANN version {self.diann_version}")

def convert_to_mztab(
self,
report,
f_table,
charge: int,
missed_cleavages: int,
dia_params: List[Any],
out: os.PathLike,
self,
report,
f_table,
charge: int,
missed_cleavages: int,
dia_params: List[Any],
out: Union[os.PathLike, str],
) -> None:
logger.info("Converting to mzTab")
self.validate_diann_version()
@@ -481,8 +481,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
}
mass_vector = report["Modified.Sequence"].map(uniq_masses)
report["Calculate.Precursor.Mz"] = (
mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
) / report["Precursor.Charge"]
mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
) / report["Precursor.Charge"]

logger.debug("Indexing Precursors")
# Making the map is 1500x faster
@@ -589,16 +589,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta
out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null"
out_mztab_mtd.loc[1, "software[1]-setting[3]"] = (
"fragment_mass_tolerance:" + fragment_mass_tolerance
"fragment_mass_tolerance:" + fragment_mass_tolerance
)
out_mztab_mtd.loc[1, "software[1]-setting[4]"] = (
"fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
"fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
)
out_mztab_mtd.loc[1, "software[1]-setting[5]"] = (
"precursor_mass_tolerance:" + precursor_mass_tolerance
"precursor_mass_tolerance:" + precursor_mass_tolerance
)
out_mztab_mtd.loc[1, "software[1]-setting[6]"] = (
"precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
"precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
)
out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme
out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full"
@@ -607,10 +607,10 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
missed_cleavages
)
out_mztab_mtd.loc[1, "software[1]-setting[11]"] = (
"fixed_modifications:" + fixed_modifications
"fixed_modifications:" + fixed_modifications
)
out_mztab_mtd.loc[1, "software[1]-setting[12]"] = (
"variable_modifications:" + variable_modifications
"variable_modifications:" + variable_modifications
)

(fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info(
@@ -633,7 +633,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
]
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[
i - 1
][1]
][1]
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere"
else:
out_mztab_mtd.loc[1, "variable_mod[1]"] = variable_mods[0]
@@ -649,8 +649,8 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
"[MS, MS:1000584, mzML file, ]"
)
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = (
"file://"
+ index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
"file://"
+ index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
)
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
"[MS, MS:1000777, spectrum identifier nativeID format, ]"
@@ -659,7 +659,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
"[MS, MS:1002038, unlabeled sample, ]"
)
out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = (
"ms_run[" + str(i) + "]"
"ms_run[" + str(i) + "]"
)

with warnings.catch_warnings():
@@ -723,9 +723,9 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
col = {}
for i in file:
col[i] = (
"protein_abundance_assay["
+ str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
+ "]"
"protein_abundance_assay["
+ str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
+ "]"
)

pg.rename(columns=col, inplace=True)
@@ -767,7 +767,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):

protein_details_df = out_mztab_prh[
out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
]
]
prh_series = (
protein_details_df["Protein.Ids"]
.str.split(";", expand=True)
@@ -884,11 +884,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):


def mztab_peh(
report: pd.DataFrame,
pr: pd.DataFrame,
precursor_list: List[str],
index_ref: pd.DataFrame,
database: os.PathLike,
report: pd.DataFrame,
pr: pd.DataFrame,
precursor_list: List[str],
index_ref: pd.DataFrame,
database: os.PathLike,
) -> pd.DataFrame:
"""
Construct PEH sub-table.
@@ -1106,8 +1106,8 @@ def __find_info(directory, n):
# Standardize spectrum identifier format for bruker data
if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str):
target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[
:, "opt_global_spectrum_reference"
].astype(str)
:, "opt_global_spectrum_reference"
].astype(str)

# TODO seconds returned from precursor.getRT()
target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1)
@@ -1184,7 +1184,7 @@ def __find_info(directory, n):

out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply(
lambda x: "ms_run[{}]:".format(x["ms_run"])
+ x["opt_global_spectrum_reference"],
+ x["opt_global_spectrum_reference"],
axis=1,
result_type="expand",
)
@@ -1556,8 +1556,8 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
for start, length in sorted(zip(starts, lengths)):
if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
merged_lengths[-1] = (
max(merged_starts[-1] + merged_lengths[-1], start + length)
- merged_starts[-1]
max(merged_starts[-1] + merged_lengths[-1], start + length)
- merged_starts[-1]
)
else:
merged_starts.append(start)
@@ -1569,7 +1569,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):


def calculate_protein_coverages(
report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
) -> List[str]:
"""Calculates protein coverages for the PRH table.
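The calculate_coverage hunk above merges overlapping peptide intervals before summing their lengths, so residues covered by several peptides are only counted once. The stand-alone sketch below is not from the commit and uses str.find to locate each peptide (assuming a single occurrence per peptide is enough for illustration); it only demonstrates the interval-merging idea.

from typing import Set

def coverage_sketch(ref_sequence: str, sequences: Set[str]) -> float:
    # Turn each matched peptide into a (start, length) interval on the protein.
    starts, lengths = [], []
    for seq in sequences:
        pos = ref_sequence.find(seq)
        if pos >= 0:
            starts.append(pos)
            lengths.append(len(seq))

    # Merge overlapping or adjacent intervals, as in the hunk above.
    merged_starts, merged_lengths = [], []
    for start, length in sorted(zip(starts, lengths)):
        if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
            merged_lengths[-1] = (
                max(merged_starts[-1] + merged_lengths[-1], start + length)
                - merged_starts[-1]
            )
        else:
            merged_starts.append(start)
            merged_lengths.append(length)

    # Coverage is the merged residue count over the protein length.
    return sum(merged_lengths) / len(ref_sequence)

print(coverage_sketch("MKWVTFISLLLLFSSAYS", {"MKWVT", "WVTFI", "SSAYS"}))  # ~0.67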
9 changes: 8 additions & 1 deletion quantmsutils/diann/dianncfg.py
@@ -52,6 +52,7 @@ def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List
if fix_mod != "":
for mod in fix_mod.split(","):
tag = 0
diann_mod = None
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = (
@@ -80,12 +81,18 @@ def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List
or "mTRAQ" in diann_mod
):
fix_ptm.append(diann_mod + "," + site + "," + "label")
else:
elif diann_mod is not None:
fix_ptm.append(diann_mod + "," + site)
else:
print(
"Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: "
+ mod
)

if var_mod != "":
for mod in var_mod.split(","):
tag = 0
diann_mod = None
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = (
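The dianncfg change above initializes diann_mod to None on every iteration and only appends a converted modification when a matching UniMod entry was found; anything else is reported and skipped instead of triggering an unbound-variable error later. A minimal stand-alone sketch of that guard pattern (hypothetical names, not the real convert_mod signature):

def convert_mods_sketch(unimod_names, requested_mods):
    converted = []
    for mod in requested_mods.split(","):
        diann_mod = None  # reset for every requested modification
        name, site = mod.split(" ")[0], mod.split(" ")[-1]
        if name in unimod_names:
            diann_mod = f"{name} ({site})"
        if diann_mod is not None:
            converted.append(diann_mod)
        else:
            # Mirrors the warning added in this commit for unsupported mods.
            print(
                "Warning: Currently only supported unimod modifications "
                f"for DIA pipeline. Skipped: {mod}"
            )
    return converted

print(convert_mods_sketch({"Carbamidomethyl", "Oxidation"},
                          "Carbamidomethyl C,Oxidation M,MyCustomMod K"))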
1 change: 1 addition & 0 deletions quantmsutils/mzml/mzml_statistics.py
@@ -22,6 +22,7 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False) -> None:
# Command line usage example
python script_name.py mzml_statistics --ms_path "path/to/file.mzML"
:param ctx: Click context
:param ms_path: A string specifying the path to the mass spectrometry file.
:param id_only: A boolean flag that, when set to True, generates a CSV file containing only the spectrum ID and
peaks data for MS level 2 spectra.
8 changes: 7 additions & 1 deletion quantmsutils/quantmsutilsc.py
@@ -31,5 +31,11 @@ def cli():

cli.add_command(convert_psm)

def main():
try:
cli()
except SystemExit as e:
if e.code != 0:
raise
if __name__ == "__main__":
cli()
main()
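The new main() wrapper accounts for the fact that click commands finish by raising SystemExit even on success, so only non-zero exit codes are re-raised as real failures. A small illustration of that behavior (a toy command group, not the actual quantmsutilsc CLI):

import click

@click.group()
def cli():
    """Toy stand-in for the quantmsutilsc command group."""

def main():
    try:
        cli(["--help"])  # click raises SystemExit once the command finishes
    except SystemExit as e:
        if e.code != 0:  # swallow the expected zero exit, re-raise failures
            raise

main()  # prints the group help and returns instead of exiting the interpreter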