Merge pull request #36 from theGreatHerrLebert/tm_synthetics

transfer simulation framework from proteolizard
theGreatHerrLebert · Nov 27, 2023 · 58adac0 · 58adac0
2 parents 98d170d + 7389121
commit 58adac0
Show file tree

Hide file tree

Showing 18 changed files with 2,821 additions and 18 deletions.
diff --git a/imspy/examples/simulation/run_example_simulation.py b/imspy/examples/simulation/run_example_simulation.py
@@ -0,0 +1,101 @@
+from imspy.simulation.experiment import LcImsMsMs
+from imspy.simulation.hardware_models import (NeuralChromatographyApex, 
+                                              NormalChromatographyProfileModel, 
+                                              LiquidChromatography, 
+                                              ElectroSpray,
+                                              TrappedIon,
+                                              TOF,
+                                              NeuralIonMobilityApex, 
+                                              NormalIonMobilityProfileModel,
+                                              AveragineModel,
+                                              BinomialIonSource
+                                              )
+from imspy.proteome import ProteinSample, Trypsin, ORGANISM
+from imspy.chemistry.mass import BufferGas
+
+import pandas as pd
+import numpy as np
+
+
+def irt_to_rt(irt):
+    return irt
+
+
+def scan_im_interval(scan_id):
+    intercept = 1451.357
+    slope = -877.361
+    scan_id = np.atleast_1d(scan_id)
+    lower = ( scan_id    - intercept ) / slope
+    upper = ((scan_id+1) - intercept ) / slope
+    return np.stack([1/lower, 1/upper], axis=1)
+
+
+def im_to_scan(reduced_ion_mobility):
+    intercept = 1451.357
+    slope = -877.361
+    # TODO more appropriate function here ?
+    one_over_k0 = 1/reduced_ion_mobility
+    return np.round(one_over_k0 * slope + intercept).astype(np.int16)
+
+
+def build_experiment():
+    t = LcImsMsMs("./timstofexp1_binomial_ion_source_21_7/") # maybe rather call this class LCIMSMSExperiment
+
+    lc = LiquidChromatography()
+    lc.frame_length = 1200 #ms
+    lc.gradient_length = 120 # min
+    esi = ElectroSpray()
+    tims = TrappedIon()
+    tims.scan_time = 110 # us
+    tof_mz = TOF()
+
+    t.lc_method = lc
+    t.ionization_method = esi
+    t.ion_mobility_separation_method = tims
+    t.mz_separation_method = tof_mz
+
+    N2 = BufferGas("N2")
+
+    tokenizer_path = '/home/tim/Workspaces/ionmob/pretrained-models/tokenizers/tokenizer.json'
+
+    rt_model_weights = "/home/tim/Workspaces/Resources/models/DeepChromatograpy/"
+    t.lc_method.apex_model = NeuralChromatographyApex(rt_model_weights,tokenizer_path = tokenizer_path)
+
+    t.lc_method.profile_model = NormalChromatographyProfileModel()
+    t.lc_method.irt_to_rt_converter = irt_to_rt
+
+    im_model_weights = "/home/tim/Workspaces/ionmob/pretrained-models/GRUPredictor"
+    t.ion_mobility_separation_method.apex_model = NeuralIonMobilityApex(im_model_weights, tokenizer_path = tokenizer_path)
+
+    t.ion_mobility_separation_method.profile_model = NormalIonMobilityProfileModel()
+    t.ion_mobility_separation_method.buffer_gas = N2
+    t.ion_mobility_separation_method.scan_to_reduced_im_interval_converter = scan_im_interval
+    t.ion_mobility_separation_method.reduced_im_to_scan_converter = im_to_scan
+
+    t.ionization_method.ionization_model = BinomialIonSource()
+
+    t.mz_separation_method.model = AveragineModel()
+
+    rng = np.random.default_rng(2023)
+    # read proteome
+    proteome = pd.read_feather('/home/tim/Workspaces/Resources/Homo-sapiens-proteome.feather')
+    random_abundances = rng.integers(1e3,1e7,size=proteome.shape[0])
+    proteome = proteome.assign(abundancy= random_abundances)
+    # create sample and sample digest; TODO: add missed cleavages to ENZYMEs
+    sample = ProteinSample(proteome, ORGANISM.HOMO_SAPIENS)
+    sample_digest = sample.digest(Trypsin())
+
+    # to reduce computational load in example
+    sample_digest.data = sample_digest.data.sample(100, random_state= rng)
+
+
+    t.load_sample(sample_digest)
+    return t
+
+
+if __name__ == "__main__":
+
+    t = build_experiment()
+
+    #cProfile.run("t.run(10000)", filename="profiler_10000_8_process",sort="cumtime")
+    t.run(100, frames_per_assemble_process=10)
diff --git a/imspy/imspy/chemistry/mass.py b/imspy/imspy/chemistry/mass.py
@@ -1,11 +1,21 @@
 import re
 from collections import defaultdict
 import numpy as np
+import mendeleev as me
 
 MASS_PROTON = 1.007276466583
 MASS_NEUTRON = 1.00866491597
 MASS_ELECTRON = 0.00054857990946
-MASS_H2O = 18.0105646863
+MASS_WATER = 18.0105646863
+
+# IUPAC standard in Kelvin
+STANDARD_TEMPERATURE = 273.15
+# IUPAC standard in Pa
+STANDARD_PRESSURE = 1e5
+# IUPAC elementary charge
+ELEMENTARY_CHARGE = 1.602176634e-19
+# IUPAC BOLTZMANN'S CONSTANT
+K_BOLTZMANN = 1.380649e-23
 
 AMINO_ACIDS = {
     'Lysine': 'K',
@@ -146,7 +156,7 @@ def calculate_monoisotopic_mass(sequence):
     mass_sequence = np.sum([AMINO_ACID_MASSES[amino_acid] * count for amino_acid, count in aa_counts.items()])
     mass_modifics = np.sum([MODIFICATIONS_MZ_NUMERICAL[mod] * count for mod, count in mod_counts.items()])
 
-    return mass_sequence + mass_modifics + MASS_H2O
+    return mass_sequence + mass_modifics + MASS_WATER
 
 
 def calculate_mz(monoisotopic_mass, charge):
@@ -174,3 +184,151 @@ def calculate_mz_from_sequence(sequence, charge):
     float: The m/z of the sequence.
     """
     return calculate_mz(calculate_monoisotopic_mass(sequence), charge)
+
+
+def get_monoisotopic_token_weight(token:str):
+    """
+    Gets monoisotopic weight of token
+
+    :param token: Token of aa sequence e.g. "<START>[UNIMOD:1]"
+    :type token: str
+    :return: Weight in Dalton.
+    :rtype: float
+    """
+    splits = token.split("[")
+    for i in range(1, len(splits)):
+        splits[i] = "["+splits[i]
+
+    mass = 0
+    for split in splits:
+        mass += AMINO_ACID_MASSES[split]
+    return mass
+
+
+def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
+    mass = 0
+    for token in sequence_tokenized:
+        mass += get_monoisotopic_token_weight(token)
+    return mass + MASS_WATER
+
+
+def get_mass_over_charge(mass: float, charge: int) -> float:
+    return (mass / charge) + MASS_PROTON
+
+
+def get_num_protonizable_sites(sequence: str) -> int:
+    """
+    Gets number of sites that can be protonized.
+    This function does not yet account for PTMs.
+
+    :param sequence: Amino acid sequence
+    :type sequence: str
+    :return: Number of protonizable sites
+    :rtype: int
+    """
+    sites = 1 # n-terminus
+    for s in sequence:
+        if s in ["H", "R", "K"]:
+            sites += 1
+    return sites
+
+
+class ChemicalCompound:
+
+    def _calculate_molecular_mass(self):
+        mass = 0
+        for (atom, abundance) in self.element_composition.items():
+            mass += me.element(atom).atomic_weight * abundance
+        return mass
+
+    def __init__(self, formula):
+        self.element_composition = self.get_composition(formula)
+        self.mass = self._calculate_molecular_mass()
+
+    def get_composition(self, formula: str):
+        """
+        Parse chemical formula into Dict[str:int] with
+        atoms as keys and the respective counts as values.
+
+        :param formula: Chemical formula of compound e.g. 'C6H12O6'
+        :type formula: str
+        :return: Dictionary Atom: Count
+        :rtype: Dict[str:int]
+        """
+        if formula.startswith("("):
+            assert formula.endswith(")")
+            formula = formula[1:-1]
+
+        tmp_group = ""
+        tmp_group_count = ""
+        depth = 0
+        comp_list = []
+        comp_counts = []
+
+        # extract components: everything in brackets and atoms
+        # extract component counts: number behind component or 1
+        for (i, e) in enumerate(formula):
+            if e == "(":
+                depth += 1
+                if depth == 1:
+                    if tmp_group != "":
+                        comp_list.append(tmp_group)
+                        tmp_group = ""
+                        if tmp_group_count == "":
+                            comp_counts.append(1)
+                        else:
+                            comp_counts.append(int(tmp_group_count))
+                            tmp_group_count = ""
+                tmp_group += e
+                continue
+            if e == ")":
+                depth -= 1
+                tmp_group += e
+                continue
+            if depth > 0:
+                tmp_group += e
+                continue
+            if e.isupper():
+                if tmp_group != "":
+                    comp_list.append(tmp_group)
+                    tmp_group = ""
+                    if tmp_group_count == "":
+                        comp_counts.append(1)
+                    else:
+                        comp_counts.append(int(tmp_group_count))
+                        tmp_group_count = ""
+                tmp_group += e
+                continue
+            if e.islower():
+                tmp_group += e
+                continue
+            if e.isnumeric():
+                tmp_group_count += e
+        if tmp_group != "":
+            comp_list.append(tmp_group)
+            if tmp_group_count == "":
+                comp_counts.append(1)
+            else:
+                comp_counts.append(int(tmp_group_count))
+
+        # assemble dictionary from component lists
+        atom_dict = {}
+        for (comp, count) in zip(comp_list, comp_counts):
+            if not comp.startswith("("):
+                atom_dict[comp] = count
+            else:
+                atom_dicts_depth = self.get_composition(comp)
+                for atom in atom_dicts_depth:
+                    atom_dicts_depth[atom] *= count
+                    if atom in atom_dict:
+                        atom_dict[atom] += atom_dicts_depth[atom]
+                    else:
+                        atom_dict[atom] = atom_dicts_depth[atom]
+                atom_dicts_depth = {}
+        return atom_dict
+
+
+class BufferGas(ChemicalCompound):
+
+    def __init__(self, formula: str):
+        super().__init__(formula)
diff --git a/imspy/imspy/chemistry/mobility.py b/imspy/imspy/chemistry/mobility.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+SUMMARY_CONSTANT = 18509.8632163405
+
 
 def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
     """
@@ -11,7 +13,6 @@ def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_d
     :param temp: temperature of the drift gas in C°
     :param t_diff: factor to translate from C° to K
     """
-    SUMMARY_CONSTANT = 18509.8632163405
     reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
     return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)
 
@@ -26,6 +27,5 @@ def ccs_to_one_over_k0(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.
     :param temp: temperature of the drift gas in C°
     :param t_diff: factor to translate from C° to K
     """
-    SUMMARY_CONSTANT = 18509.8632163405
     reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
     return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge)
diff --git a/imspy/imspy/core/frame.py b/imspy/imspy/core/frame.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import imspy_connector as pims
+
 from imspy.core.spectrum import MzSpectrum, TimsSpectrum, IndexedMzSpectrum
 
 from imspy.utility.utilities import re_index_indices

diff --git a/imspy/imspy/core/slice.py b/imspy/imspy/core/slice.py
@@ -8,6 +8,7 @@
 from imspy.utility.utilities import re_index_indices
 
 import imspy_connector as pims
+
 from imspy.core.frame import TimsFrame, TimsFrameVectorized
 from imspy.core.spectrum import MzSpectrum, TimsSpectrum