Skip to content

Commit

Permalink
fixed diann scan and add ms2rescore arg
Browse files Browse the repository at this point in the history
  • Loading branch information
daichengxin committed Sep 9, 2024
1 parent 25a6daa commit 9cb1ab0
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 79 deletions.
2 changes: 1 addition & 1 deletion quantmsutils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.10"
__version__ = "0.0.11"
169 changes: 91 additions & 78 deletions quantmsutils/diann/diann2mztab.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@
@click.option("--qvalue_threshold", "-q", type=float)
@click.pass_context
def diann2mztab(
ctx,
folder,
exp_design,
dia_params,
diann_version,
charge,
missed_cleavages,
qvalue_threshold,
ctx,
folder,
exp_design,
dia_params,
diann_version,
charge,
missed_cleavages,
qvalue_threshold,
):
"""
Convert DIA-NN output to MSstats, Triqler or mzTab.
Expand Down Expand Up @@ -228,7 +228,7 @@ def get_exp_design_dfs(exp_design_file):
lambda x: _true_stem(x["Spectra_Filepath"]), axis=1
)

s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:]
s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:]
s_header = data[empty_row + 1].replace("\n", "").split("\t")
s_data_frame = pd.DataFrame(s_table, columns=s_header)

Expand Down Expand Up @@ -265,31 +265,31 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
if aa in aa_mass and not_mod:
aa = aa_mass[aa]
elif (
aa
not in [
"G",
"A",
"V",
"L",
"I",
"F",
"M",
"P",
"W",
"S",
"C",
"T",
"Y",
"N",
"Q",
"D",
"E",
"K",
"R",
"H",
]
and not_mod
and aa != ")"
aa
not in [
"G",
"A",
"V",
"L",
"I",
"F",
"M",
"P",
"W",
"S",
"C",
"T",
"Y",
"N",
"Q",
"D",
"E",
"K",
"R",
"H",
]
and not_mod
and aa != ")"
):
logger.info(f"Unknown amino acid with mass not known:{aa}")
peptide_parts.append(aa)
Expand Down Expand Up @@ -367,13 +367,13 @@ def validate_diann_version(self) -> None:
raise ValueError(f"Unsupported DIANN version {self.diann_version}")

def convert_to_mztab(
self,
report,
f_table,
charge: int,
missed_cleavages: int,
dia_params: List[Any],
out: Union[os.PathLike, str],
self,
report,
f_table,
charge: int,
missed_cleavages: int,
dia_params: List[Any],
out: Union[os.PathLike, str],
) -> None:
logger.info("Converting to mzTab")
self.validate_diann_version()
Expand Down Expand Up @@ -450,7 +450,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
"Protein.Ids",
"First.Protein.Description",
"PG.MaxLFQ",
"RT.Start",
"RT",
"MS2.Scan",
"Global.Q.Value",
"Lib.Q.Value",
"PEP",
Expand All @@ -462,6 +463,7 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
"Precursor.Charge",
"Precursor.Quantity",
"Global.PG.Q.Value",
"MS2.Scan"
]
report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols)

Expand All @@ -481,8 +483,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
}
mass_vector = report["Modified.Sequence"].map(uniq_masses)
report["Calculate.Precursor.Mz"] = (
mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
) / report["Precursor.Charge"]
mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
) / report["Precursor.Charge"]

logger.debug("Indexing Precursors")
# Making the map is 1500x faster
Expand Down Expand Up @@ -589,16 +591,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta
out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null"
out_mztab_mtd.loc[1, "software[1]-setting[3]"] = (
"fragment_mass_tolerance:" + fragment_mass_tolerance
"fragment_mass_tolerance:" + fragment_mass_tolerance
)
out_mztab_mtd.loc[1, "software[1]-setting[4]"] = (
"fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
"fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
)
out_mztab_mtd.loc[1, "software[1]-setting[5]"] = (
"precursor_mass_tolerance:" + precursor_mass_tolerance
"precursor_mass_tolerance:" + precursor_mass_tolerance
)
out_mztab_mtd.loc[1, "software[1]-setting[6]"] = (
"precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
"precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
)
out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme
out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full"
Expand All @@ -607,10 +609,10 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
missed_cleavages
)
out_mztab_mtd.loc[1, "software[1]-setting[11]"] = (
"fixed_modifications:" + fixed_modifications
"fixed_modifications:" + fixed_modifications
)
out_mztab_mtd.loc[1, "software[1]-setting[12]"] = (
"variable_modifications:" + variable_modifications
"variable_modifications:" + variable_modifications
)

(fixed_mods, variable_mods, fix_flag, var_flag) = mtd_mod_info(
Expand All @@ -633,7 +635,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
]
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-site"] = variable_mods[
i - 1
][1]
][1]
out_mztab_mtd.loc[1, "variable_mod[" + str(i) + "]-position"] = "Anywhere"
else:
out_mztab_mtd.loc[1, "variable_mod[1]"] = variable_mods[0]
Expand All @@ -649,8 +651,8 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
"[MS, MS:1000584, mzML file, ]"
)
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = (
"file://"
+ index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
"file://"
+ index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
)
out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
"[MS, MS:1000777, spectrum identifier nativeID format, ]"
Expand All @@ -659,7 +661,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages):
"[MS, MS:1002038, unlabeled sample, ]"
)
out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = (
"ms_run[" + str(i) + "]"
"ms_run[" + str(i) + "]"
)

with warnings.catch_warnings():
Expand Down Expand Up @@ -723,9 +725,9 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
col = {}
for i in file:
col[i] = (
"protein_abundance_assay["
+ str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
+ "]"
"protein_abundance_assay["
+ str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
+ "]"
)

pg.rename(columns=col, inplace=True)
Expand Down Expand Up @@ -766,7 +768,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):

protein_details_df = out_mztab_prh[
out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
]
]
prh_series = (
protein_details_df["Protein.Group"]
.str.split(";", expand=True)
Expand Down Expand Up @@ -883,11 +885,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):


def mztab_peh(
report: pd.DataFrame,
pr: pd.DataFrame,
precursor_list: List[str],
index_ref: pd.DataFrame,
database: os.PathLike,
report: pd.DataFrame,
pr: pd.DataFrame,
precursor_list: List[str],
index_ref: pd.DataFrame,
database: os.PathLike,
) -> pd.DataFrame:
"""
Construct PEH sub-table.
Expand Down Expand Up @@ -1020,7 +1022,7 @@ def mztab_peh(
.agg(
{
"Q.Value": "min",
"RT.Start": "mean",
"RT": "mean",
"Global.Q.Value": "min",
"Lib.Q.Value": "min",
"Calculate.Precursor.Mz": "mean",
Expand All @@ -1031,7 +1033,7 @@ def mztab_peh(
columns={
"precursor.Index": "pr_id",
"Q.Value": "best_search_engine_score[1]",
"RT.Start": "retention_time",
"RT": "retention_time",
"Global.Q.Value": "opt_global_q-value",
"Lib.Q.Value": "opt_global_SpecEValue_score",
"Calculate.Precursor.Mz": "mass_to_charge",
Expand Down Expand Up @@ -1095,36 +1097,47 @@ def __find_info(directory, n):

file = __find_info(folder, n)
target = pd.read_parquet(file)
group.sort_values(by="RT.Start", inplace=True)
target = target[["Retention_Time", "SpectrumID", "Exp_Mass_To_Charge"]]
target = target[target["MSLevel"] == 2]
target.reset_index(inplace=True, drop=True)
target["DIANN-intraID"] = target.index
group.sort_values(by="RT", inplace=True)
target = target[["Retention_Time", "SpectrumID", "DIANN-intraID", "Exp_Mass_To_Charge"]]
target.columns = [
"RT.Start",
"RT",
"opt_global_spectrum_reference",
"DIANN-intraID",
"exp_mass_to_charge",
]
# Standardize spectrum identifier format for bruker data
if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str):
target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[
:, "opt_global_spectrum_reference"
].astype(str)
:, "opt_global_spectrum_reference"
].astype(str)

# TODO seconds returned from precursor.getRT()
target.loc[:, "RT.Start"] = target.apply(lambda x: x["RT.Start"] / 60, axis=1)
target.loc[:, "RT"] = target.apply(lambda x: x["RT"] / 60, axis=1)
out_mztab_psh = pd.concat(
[
out_mztab_psh,
pd.merge_asof(group, target, on="RT.Start", direction="nearest"),
pd.merge_asof(group, target, on="RT", direction="nearest"),
]
)
del report

# Cross validation spectrum ID between scan matched and RT matched
if "MS2.Scan" in out_mztab_psh.columns:
out_mztab_psh["unassigned_matched"] = out_mztab_psh.apply(lambda row: 1 if row["MS2.Scan"] != row["DIANN-intraID"] else 0, axis=1)
if len(out_mztab_psh[out_mztab_psh["unassigned_matched"] == 1]) > 0:
v_str = out_mztab_psh[out_mztab_psh["unassigned_matched"] == 1]["MS2.Scan"].tolist()
raise ValueError(f"Can't matched correctly DIA-NN scan in mzML: {v_str}")

# Score at PSM level: Q.Value
out_mztab_psh = out_mztab_psh[
[
"Stripped.Sequence",
"Protein.Group",
"Q.Value",
"RT.Start",
"RT",
"Precursor.Charge",
"Calculate.Precursor.Mz",
"exp_mass_to_charge",
Expand Down Expand Up @@ -1183,7 +1196,7 @@ def __find_info(directory, n):

out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply(
lambda x: "ms_run[{}]:".format(x["ms_run"])
+ x["opt_global_spectrum_reference"],
+ x["opt_global_spectrum_reference"],
axis=1,
result_type="expand",
)
Expand Down Expand Up @@ -1476,7 +1489,7 @@ def per_peptide_study_report(report: pd.DataFrame) -> pd.DataFrame:
.agg(
{
"Precursor.Normalised": ["mean", "std", "sem"],
"RT.Start": ["mean"],
"RT": ["mean"],
"Calculate.Precursor.Mz": ["mean"],
}
)
Expand Down Expand Up @@ -1555,8 +1568,8 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
for start, length in sorted(zip(starts, lengths)):
if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
merged_lengths[-1] = (
max(merged_starts[-1] + merged_lengths[-1], start + length)
- merged_starts[-1]
max(merged_starts[-1] + merged_lengths[-1], start + length)
- merged_starts[-1]
)
else:
merged_starts.append(start)
Expand All @@ -1568,7 +1581,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):


def calculate_protein_coverages(
report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
) -> List[str]:
"""Calculates protein coverages for the PRH table.
Expand Down
Loading

0 comments on commit 9cb1ab0

Please sign in to comment.