Add unique test #129

Merged
merged 2 commits into from Nov 23, 2023
Changes from all commits
14 changes: 11 additions & 3 deletions proteobench/modules/dda_quant/datapoint.py
@@ -1,12 +1,14 @@
 import json
-import numpy as np
 from dataclasses import asdict, dataclass
 from datetime import datetime

+import numpy as np


@dataclass
class Datapoint:
"""Data used to stored the"""

# TODO add threshold value used for presence ion/peptidoform
id: str = None
search_engine: str = None
@@ -26,6 +28,7 @@ class Datapoint:
weighted_sum: int = 0
nr_prec: int = 0
is_temporary: bool = True
intermediate_hash: str = ""
# fixed_mods: [],
# variable_mods: [],
# max_number_mods_pep: int = 0,
@@ -45,13 +48,18 @@ def calculate_plot_data(self, df):
nr_missing_0 = 0
for spec in species:
f = len(df[df[spec] == True])
-            sum_s = np.nan_to_num(df[df[spec] == True]["1|2_expected_ratio_diff"], nan=0, neginf=-1000, posinf=1000).sum()
+            sum_s = np.nan_to_num(
+                df[df[spec] == True]["1|2_expected_ratio_diff"],
+                nan=0,
+                neginf=-1000,
+                posinf=1000,
+            ).sum()
ratio = sum_s / f
prop_ratio = (f / len(df)) * ratio
prop_ratios.append(prop_ratio)
sum_ratios += prop_ratio
nr_missing_0 += f

# TODO rename/document code
self.weighted_sum = round(sum_ratios, ndigits=3)
self.nr_prec = len(df)
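The intermediate_hash field introduced above is what the new uniqueness test keys on: it fingerprints the intermediate result table, so two submissions built from identical intermediate data carry identical hashes. A minimal standalone sketch of that hashing step, mirroring the sha1-over-to_string() call added in module.py below (the helper name is illustrative, not part of ProteoBench):

import hashlib

import pandas as pd


def hash_intermediate(intermediate: pd.DataFrame) -> str:
    # Serialize the table deterministically, then fingerprint the bytes;
    # identical intermediate data always yields the same SHA-1 digest.
    return hashlib.sha1(intermediate.to_string().encode("utf-8")).hexdigest()


# Same data, same digest: the property the duplicate check relies on.
df = pd.DataFrame({"ratio": [1.0, 2.0]})
assert hash_intermediate(df) == hash_intermediate(df.copy())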
79 changes: 46 additions & 33 deletions proteobench/modules/dda_quant/module.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import datetime
import hashlib
import itertools
import os
import re
@@ -10,11 +11,14 @@
import numpy as np
import pandas as pd
import streamlit as st

from proteobench.github.gh import clone_repo, pr_github, read_results_json_repo
from proteobench.modules.dda_quant.datapoint import Datapoint
from proteobench.modules.dda_quant.parse import ParseInputs
from proteobench.modules.dda_quant.parse_settings import (
-    DDA_QUANT_RESULTS_REPO, ParseSettings)
+    DDA_QUANT_RESULTS_REPO,
+    ParseSettings,
+)
from proteobench.modules.interfaces import ModuleInterface


@@ -129,13 +133,8 @@ def generate_datapoint(
current_datetime = datetime.datetime.now()
formatted_datetime = current_datetime.strftime("%Y%m%d_%H%M%S_%f")


result_datapoint = Datapoint(
-            id=input_format
-            + "_"
-            + user_input["version"]
-            + "_"
-            + formatted_datetime,
+            id=input_format + "_" + user_input["version"] + "_" + formatted_datetime,
search_engine=input_format,
software_version=user_input["version"],
fdr_psm=user_input["fdr_psm"],
@@ -150,6 +149,9 @@
missed_cleavages=user_input["allowed_missed_cleavage"],
min_pep_length=user_input["min_peptide_length"],
max_pep_length=user_input["max_peptide_length"],
            intermediate_hash=hashlib.sha1(
                intermediate.to_string().encode("utf-8")
            ).hexdigest(),
)
result_datapoint.generate_id()
result_datapoint.calculate_plot_data(intermediate)
@@ -167,7 +169,7 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame:
elif input_format == "AlphaPept":
input_data_frame = pd.read_csv(input_csv, low_memory=False)
elif input_format == "Sage":
-            input_data_frame = pd.read_csv(input_csv, sep='\t', low_memory=False)
+            input_data_frame = pd.read_csv(input_csv, sep="\t", low_memory=False)
elif input_format == "MSFragger":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
elif input_format == "WOMBAT":
@@ -180,18 +182,17 @@ def load_input_file(self, input_csv: str, input_format: str) -> pd.DataFrame:
elif input_format == "Custom":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")


return input_data_frame

def add_current_data_point(self, all_datapoints, current_datapoint):
"""Add current data point to all data points and load them from file if empty. TODO: Not clear why is the df transposed here."""
if not isinstance(all_datapoints, pd.DataFrame):
-            #all_datapoints = pd.read_json(DDA_QUANT_RESULTS_PATH)
+            # all_datapoints = pd.read_json(DDA_QUANT_RESULTS_PATH)
all_datapoints = read_results_json_repo(DDA_QUANT_RESULTS_REPO)

all_datapoints["old_new"] = "old"
all_datapoints = all_datapoints.T

current_datapoint["old_new"] = "new"
all_datapoints = pd.concat([all_datapoints, current_datapoint], axis=1)
all_datapoints = all_datapoints.T.reset_index(drop=True)
@@ -222,8 +223,27 @@ def benchmarking(
all_datapoints = self.add_current_data_point(all_datapoints, current_datapoint)

# TODO check why there are NA and inf/-inf values
-        return intermediate_data_structure.fillna(0.0).replace([np.inf, -np.inf], 0), all_datapoints
+        return (
+            intermediate_data_structure.fillna(0.0).replace([np.inf, -np.inf], 0),
+            all_datapoints,
+        )

def check_new_unique_hash(self, datapoints):
current_datapoint = datapoints[datapoints["old_new"] == "new"]
all_datapoints_old = datapoints[datapoints["old_new"] == "old"]

set_current_datapoint = set(list(current_datapoint["intermediate_hash"]))
set_all_datapoints_old = set(list(all_datapoints_old["intermediate_hash"]))

overlap = set_current_datapoint.intersection(set_all_datapoints_old)

if len(overlap) > 0:
st.error(
f"The run you want to submit has been previously submitted \
under the identifier: {overlap}"
)
return False
return True

def clone_pr(
self,
@@ -236,21 +256,22 @@
):
t_dir = TemporaryDirectory().name

-        clone_repo(clone_dir=t_dir, token=token, remote_git=remote_git, username=username)
+        clone_repo(
+            clone_dir=t_dir, token=token, remote_git=remote_git, username=username
+        )
current_datapoint = temporary_datapoints.iloc[-1]
current_datapoint["is_temporary"] = False
all_datapoints = self.add_current_data_point(None, current_datapoint)

if not self.check_new_unique_hash(all_datapoints):
return

branch_name = current_datapoint["id"]

# do the pd.write_json() here!!!
print(os.path.join(t_dir, "results.json"))
f = open(os.path.join(t_dir, "results.json"), "w")

-        all_datapoints.to_json(
-            f,
-            orient="records",
-            indent=2
-        )
+        all_datapoints.to_json(f, orient="records", indent=2)

f.close()
commit_message = f"Added new run with id {branch_name} \n user comments: {submission_comments}"
@@ -261,14 +282,10 @@
remote_git=remote_git,
username=username,
branch_name=branch_name,
-            commit_message=commit_message
+            commit_message=commit_message,
)


-    def write_json_local_development(
-        self,
-        temporary_datapoints
-    ):
+    def write_json_local_development(self, temporary_datapoints):
t_dir = TemporaryDirectory().name
os.mkdir(t_dir)

@@ -281,11 +298,7 @@
print(f"Writing the json to: {fname}")

f = open(os.path.join(t_dir, "results.json"), "w")

-        all_datapoints.to_json(
-            f,
-            orient="records",
-            indent=2
-        )
+        all_datapoints.to_json(f, orient="records", indent=2)

return os.path.join(t_dir, "results.json")
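
End to end, the flow this PR adds is: hash the intermediate table, store the digest on the datapoint, and abort clone_pr when that digest already appears among the old results. A self-contained sketch of the set-intersection check, using the same old_new and intermediate_hash columns as the diff above (the sample frame and function name are illustrative):

import pandas as pd


def find_duplicate_hashes(datapoints: pd.DataFrame) -> set:
    # Hashes belonging to the run being submitted now...
    new = set(datapoints.loc[datapoints["old_new"] == "new", "intermediate_hash"])
    # ...and hashes of everything already in the results repository.
    old = set(datapoints.loc[datapoints["old_new"] == "old", "intermediate_hash"])
    return new & old  # non-empty means this run was submitted before


datapoints = pd.DataFrame(
    {
        "old_new": ["old", "old", "new"],
        "intermediate_hash": ["a1b2", "c3d4", "a1b2"],
    }
)
print(find_duplicate_hashes(datapoints))  # {'a1b2'}: the submission is rejected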