Skip to content

Commit

Permalink
Merge pull request #36 from theGreatHerrLebert/tm_synthetics
Browse files Browse the repository at this point in the history
transfer simulation framework from proteolizard
  • Loading branch information
theGreatHerrLebert authored Nov 27, 2023
2 parents 98d170d + 7389121 commit 58adac0
Show file tree
Hide file tree
Showing 18 changed files with 2,821 additions and 18 deletions.
101 changes: 101 additions & 0 deletions imspy/examples/simulation/run_example_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from imspy.simulation.experiment import LcImsMsMs
from imspy.simulation.hardware_models import (NeuralChromatographyApex,
NormalChromatographyProfileModel,
LiquidChromatography,
ElectroSpray,
TrappedIon,
TOF,
NeuralIonMobilityApex,
NormalIonMobilityProfileModel,
AveragineModel,
BinomialIonSource
)
from imspy.proteome import ProteinSample, Trypsin, ORGANISM
from imspy.chemistry.mass import BufferGas

import pandas as pd
import numpy as np


def irt_to_rt(irt):
return irt


def scan_im_interval(scan_id):
intercept = 1451.357
slope = -877.361
scan_id = np.atleast_1d(scan_id)
lower = ( scan_id - intercept ) / slope
upper = ((scan_id+1) - intercept ) / slope
return np.stack([1/lower, 1/upper], axis=1)


def im_to_scan(reduced_ion_mobility):
intercept = 1451.357
slope = -877.361
# TODO more appropriate function here ?
one_over_k0 = 1/reduced_ion_mobility
return np.round(one_over_k0 * slope + intercept).astype(np.int16)


def build_experiment():
t = LcImsMsMs("./timstofexp1_binomial_ion_source_21_7/") # maybe rather call this class LCIMSMSExperiment

lc = LiquidChromatography()
lc.frame_length = 1200 #ms
lc.gradient_length = 120 # min
esi = ElectroSpray()
tims = TrappedIon()
tims.scan_time = 110 # us
tof_mz = TOF()

t.lc_method = lc
t.ionization_method = esi
t.ion_mobility_separation_method = tims
t.mz_separation_method = tof_mz

N2 = BufferGas("N2")

tokenizer_path = '/home/tim/Workspaces/ionmob/pretrained-models/tokenizers/tokenizer.json'

rt_model_weights = "/home/tim/Workspaces/Resources/models/DeepChromatograpy/"
t.lc_method.apex_model = NeuralChromatographyApex(rt_model_weights,tokenizer_path = tokenizer_path)

t.lc_method.profile_model = NormalChromatographyProfileModel()
t.lc_method.irt_to_rt_converter = irt_to_rt

im_model_weights = "/home/tim/Workspaces/ionmob/pretrained-models/GRUPredictor"
t.ion_mobility_separation_method.apex_model = NeuralIonMobilityApex(im_model_weights, tokenizer_path = tokenizer_path)

t.ion_mobility_separation_method.profile_model = NormalIonMobilityProfileModel()
t.ion_mobility_separation_method.buffer_gas = N2
t.ion_mobility_separation_method.scan_to_reduced_im_interval_converter = scan_im_interval
t.ion_mobility_separation_method.reduced_im_to_scan_converter = im_to_scan

t.ionization_method.ionization_model = BinomialIonSource()

t.mz_separation_method.model = AveragineModel()

rng = np.random.default_rng(2023)
# read proteome
proteome = pd.read_feather('/home/tim/Workspaces/Resources/Homo-sapiens-proteome.feather')
random_abundances = rng.integers(1e3,1e7,size=proteome.shape[0])
proteome = proteome.assign(abundancy= random_abundances)
# create sample and sample digest; TODO: add missed cleavages to ENZYMEs
sample = ProteinSample(proteome, ORGANISM.HOMO_SAPIENS)
sample_digest = sample.digest(Trypsin())

# to reduce computational load in example
sample_digest.data = sample_digest.data.sample(100, random_state= rng)


t.load_sample(sample_digest)
return t


if __name__ == "__main__":

t = build_experiment()

#cProfile.run("t.run(10000)", filename="profiler_10000_8_process",sort="cumtime")
t.run(100, frames_per_assemble_process=10)
162 changes: 160 additions & 2 deletions imspy/imspy/chemistry/mass.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
import re
from collections import defaultdict
import numpy as np
import mendeleev as me

MASS_PROTON = 1.007276466583
MASS_NEUTRON = 1.00866491597
MASS_ELECTRON = 0.00054857990946
MASS_H2O = 18.0105646863
MASS_WATER = 18.0105646863

# IUPAC standard in Kelvin
STANDARD_TEMPERATURE = 273.15
# IUPAC standard in Pa
STANDARD_PRESSURE = 1e5
# IUPAC elementary charge
ELEMENTARY_CHARGE = 1.602176634e-19
# IUPAC BOLTZMANN'S CONSTANT
K_BOLTZMANN = 1.380649e-23

AMINO_ACIDS = {
'Lysine': 'K',
Expand Down Expand Up @@ -146,7 +156,7 @@ def calculate_monoisotopic_mass(sequence):
mass_sequence = np.sum([AMINO_ACID_MASSES[amino_acid] * count for amino_acid, count in aa_counts.items()])
mass_modifics = np.sum([MODIFICATIONS_MZ_NUMERICAL[mod] * count for mod, count in mod_counts.items()])

return mass_sequence + mass_modifics + MASS_H2O
return mass_sequence + mass_modifics + MASS_WATER


def calculate_mz(monoisotopic_mass, charge):
Expand Down Expand Up @@ -174,3 +184,151 @@ def calculate_mz_from_sequence(sequence, charge):
float: The m/z of the sequence.
"""
return calculate_mz(calculate_monoisotopic_mass(sequence), charge)


def get_monoisotopic_token_weight(token:str):
"""
Gets monoisotopic weight of token
:param token: Token of aa sequence e.g. "<START>[UNIMOD:1]"
:type token: str
:return: Weight in Dalton.
:rtype: float
"""
splits = token.split("[")
for i in range(1, len(splits)):
splits[i] = "["+splits[i]

mass = 0
for split in splits:
mass += AMINO_ACID_MASSES[split]
return mass


def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
mass = 0
for token in sequence_tokenized:
mass += get_monoisotopic_token_weight(token)
return mass + MASS_WATER


def get_mass_over_charge(mass: float, charge: int) -> float:
return (mass / charge) + MASS_PROTON


def get_num_protonizable_sites(sequence: str) -> int:
"""
Gets number of sites that can be protonized.
This function does not yet account for PTMs.
:param sequence: Amino acid sequence
:type sequence: str
:return: Number of protonizable sites
:rtype: int
"""
sites = 1 # n-terminus
for s in sequence:
if s in ["H", "R", "K"]:
sites += 1
return sites


class ChemicalCompound:

def _calculate_molecular_mass(self):
mass = 0
for (atom, abundance) in self.element_composition.items():
mass += me.element(atom).atomic_weight * abundance
return mass

def __init__(self, formula):
self.element_composition = self.get_composition(formula)
self.mass = self._calculate_molecular_mass()

def get_composition(self, formula: str):
"""
Parse chemical formula into Dict[str:int] with
atoms as keys and the respective counts as values.
:param formula: Chemical formula of compound e.g. 'C6H12O6'
:type formula: str
:return: Dictionary Atom: Count
:rtype: Dict[str:int]
"""
if formula.startswith("("):
assert formula.endswith(")")
formula = formula[1:-1]

tmp_group = ""
tmp_group_count = ""
depth = 0
comp_list = []
comp_counts = []

# extract components: everything in brackets and atoms
# extract component counts: number behind component or 1
for (i, e) in enumerate(formula):
if e == "(":
depth += 1
if depth == 1:
if tmp_group != "":
comp_list.append(tmp_group)
tmp_group = ""
if tmp_group_count == "":
comp_counts.append(1)
else:
comp_counts.append(int(tmp_group_count))
tmp_group_count = ""
tmp_group += e
continue
if e == ")":
depth -= 1
tmp_group += e
continue
if depth > 0:
tmp_group += e
continue
if e.isupper():
if tmp_group != "":
comp_list.append(tmp_group)
tmp_group = ""
if tmp_group_count == "":
comp_counts.append(1)
else:
comp_counts.append(int(tmp_group_count))
tmp_group_count = ""
tmp_group += e
continue
if e.islower():
tmp_group += e
continue
if e.isnumeric():
tmp_group_count += e
if tmp_group != "":
comp_list.append(tmp_group)
if tmp_group_count == "":
comp_counts.append(1)
else:
comp_counts.append(int(tmp_group_count))

# assemble dictionary from component lists
atom_dict = {}
for (comp, count) in zip(comp_list, comp_counts):
if not comp.startswith("("):
atom_dict[comp] = count
else:
atom_dicts_depth = self.get_composition(comp)
for atom in atom_dicts_depth:
atom_dicts_depth[atom] *= count
if atom in atom_dict:
atom_dict[atom] += atom_dicts_depth[atom]
else:
atom_dict[atom] = atom_dicts_depth[atom]
atom_dicts_depth = {}
return atom_dict


class BufferGas(ChemicalCompound):

def __init__(self, formula: str):
super().__init__(formula)
4 changes: 2 additions & 2 deletions imspy/imspy/chemistry/mobility.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np

SUMMARY_CONSTANT = 18509.8632163405


def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
"""
Expand All @@ -11,7 +13,6 @@ def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_d
:param temp: temperature of the drift gas in C°
:param t_diff: factor to translate from C° to K
"""
SUMMARY_CONSTANT = 18509.8632163405
reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)

Expand All @@ -26,6 +27,5 @@ def ccs_to_one_over_k0(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.
:param temp: temperature of the drift gas in C°
:param t_diff: factor to translate from C° to K
"""
SUMMARY_CONSTANT = 18509.8632163405
reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge)
1 change: 1 addition & 0 deletions imspy/imspy/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import numpy as np
import imspy_connector as pims

from imspy.core.spectrum import MzSpectrum, TimsSpectrum, IndexedMzSpectrum

from imspy.utility.utilities import re_index_indices
Expand Down
1 change: 1 addition & 0 deletions imspy/imspy/core/slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from imspy.utility.utilities import re_index_indices

import imspy_connector as pims

from imspy.core.frame import TimsFrame, TimsFrameVectorized
from imspy.core.spectrum import MzSpectrum, TimsSpectrum

Expand Down
Loading

0 comments on commit 58adac0

Please sign in to comment.