fixing MOAMC

theGreatHerrLebert · Nov 27, 2023 · 7389121 · 7389121
2 parents 389396e + 98d170d
commit 7389121
Show file tree

Hide file tree

Showing 77 changed files with 2,432 additions and 422 deletions.
diff --git a/.github/workflows/imspy-connector-publish.yml b/.github/workflows/imspy-connector-publish.yml
@@ -0,0 +1,62 @@
+name: Build and Publish Rust Binding
+
+# Builds the imspy_connector wheel with maturin on every OS in the matrix and
+# publishes it whenever a GitHub release is published.
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  build-and-publish:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13]
+        python-version: ['3.11']
+        # Each include entry matches an existing os/python combination and only
+        # attaches `publish: true`, which gates the Publish step below.
+        include:
+          - os: ubuntu-latest
+            python-version: '3.11'
+            publish: true
+          - os: windows-latest
+            python-version: '3.11'
+            publish: true
+          - os: macos-13
+            python-version: '3.11'
+            publish: true
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Maturin
+        run: |
+          python -m pip install --upgrade pip
+          pip install maturin
+
+      - name: Set up Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+
+      # Every `run` step starts its own shell in the default working directory,
+      # so a standalone `cd` step cannot affect later steps. The directory is
+      # selected per step with `working-directory` instead.
+      - name: Build with Maturin
+        working-directory: imspy_connector
+        run: maturin build --release
+
+      - name: Publish
+        if: matrix.publish
+        working-directory: imspy_connector
+        env:
+          MATURIN_PYPI_TOKEN: ${{ secrets.IMSPY_CONNECTOR_PYPI_API_TOKEN }}
+        run: maturin publish --no-sdist
diff --git a/.github/workflows/imspy-publish.yml b/.github/workflows/imspy-publish.yml
@@ -0,0 +1,48 @@
+name: Build and Publish Python Package
+
+# Builds the imspy package with Poetry and publishes it whenever a GitHub
+# release is published.
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.11']
+        # The include entry matches the single matrix combination and only
+        # attaches `publish: true`, which gates the publish step below.
+        include:
+          - python-version: '3.11'
+            publish: true
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Poetry
+        run: |
+          python -m pip install --upgrade pip
+          pip install poetry
+
+      # Every `run` step starts its own shell in the default working directory,
+      # so a standalone `cd` step cannot affect later steps. The directory is
+      # selected per step with `working-directory` instead.
+      - name: Build package
+        working-directory: imspy
+        run: poetry build
+
+      - name: Publish package
+        if: matrix.publish
+        working-directory: imspy
+        env:
+          # Poetry picks up POETRY_PYPI_TOKEN_PYPI from the environment as the
+          # PyPI API token, so no separate `poetry config` call (which would
+          # expand the secret on a command line) is needed.
+          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.IMSPY_PYPI_API_TOKEN }}
+        run: poetry publish
diff --git a/imspy/examples/simulation/run_example_simulation.py b/imspy/examples/simulation/run_example_simulation.py
@@ -9,16 +9,18 @@
                                               NormalIonMobilityProfileModel,
                                               AveragineModel,
                                               BinomialIonSource
-                                                )
+                                              )
 from imspy.proteome import ProteinSample, Trypsin, ORGANISM
-from imspy.chemistry import BufferGas
+from imspy.chemistry.mass import BufferGas
 
 import pandas as pd
 import numpy as np
 
+
 def irt_to_rt(irt):
+    """Return ``irt`` unchanged; used below as the iRT-to-RT converter of the LC method."""
     return irt
 
+
 def scan_im_interval(scan_id):
     intercept = 1451.357
     slope = -877.361
@@ -27,13 +29,15 @@ def scan_im_interval(scan_id):
     upper = ((scan_id+1) - intercept ) / slope
     return np.stack([1/lower, 1/upper], axis=1)
 
+
 def im_to_scan(reduced_ion_mobility):
+    """Map reduced ion mobility values to scan numbers with a fixed linear calibration.
+
+    The input is inverted (``1 / value``), passed through ``slope * x + intercept``,
+    rounded and cast to ``np.int16``.
+    """
     intercept = 1451.357
     slope = -877.361
     # TODO more appropriate function here ?
     one_over_k0 = 1/reduced_ion_mobility
     return np.round(one_over_k0 * slope + intercept).astype(np.int16)
 
+
 def build_experiment():
     t = LcImsMsMs("./timstofexp1_binomial_ion_source_21_7/") # maybe rather call this class LCIMSMSExperiment
 
@@ -60,10 +64,6 @@ def build_experiment():
     t.lc_method.profile_model = NormalChromatographyProfileModel()
     t.lc_method.irt_to_rt_converter = irt_to_rt
 
-
-
-
-
     im_model_weights = "/home/tim/Workspaces/ionmob/pretrained-models/GRUPredictor"
     t.ion_mobility_separation_method.apex_model = NeuralIonMobilityApex(im_model_weights, tokenizer_path = tokenizer_path)
 
@@ -74,10 +74,8 @@ def build_experiment():
 
     t.ionization_method.ionization_model = BinomialIonSource()
 
-
     t.mz_separation_method.model = AveragineModel()
 
-
     rng = np.random.default_rng(2023)
     # read proteome
     proteome = pd.read_feather('/home/tim/Workspaces/Resources/Homo-sapiens-proteome.feather')
@@ -87,18 +85,17 @@ def build_experiment():
     sample = ProteinSample(proteome, ORGANISM.HOMO_SAPIENS)
     sample_digest = sample.digest(Trypsin())
 
-
     # to reduce computational load in example
     sample_digest.data = sample_digest.data.sample(100, random_state= rng)
 
 
     t.load_sample(sample_digest)
     return t
 
-if __name__ == "__main__":
 
+if __name__ == "__main__":
 
     t = build_experiment()
 
     #cProfile.run("t.run(10000)", filename="profiler_10000_8_process",sort="cumtime")
-    t.run(100,frames_per_assemble_process=10)
+    t.run(100, frames_per_assemble_process=10)
diff --git a/imspy/imspy/__init__.py b/imspy/imspy/__init__.py
@@ -0,0 +1,2 @@
+from imspy.core.spectrum import TimsSpectrum, MzSpectrum
+from imspy.timstof.data import TimsDataset
diff --git a/imspy/imspy/algorithm/__init__.py b/imspy/imspy/algorithm/__init__.py
@@ -0,0 +1 @@
+from .mixture import GaussianMixtureModel
diff --git a/imspy/imspy/algorithm/ccs/__init__.py b/imspy/imspy/algorithm/ccs/__init__.py
diff --git a/imspy/imspy/algorithm/ccs/predictors.py b/imspy/imspy/algorithm/ccs/predictors.py
@@ -0,0 +1,170 @@
+import numpy as np
+import tensorflow as tf
+from abc import ABC, abstractmethod
+from numpy.typing import NDArray
+from imspy.chemistry import ccs_to_one_over_k0
+from scipy.optimize import curve_fit
+from imspy.utility import tokenize_unimod_sequence
+from imspy.algorithm.utilities import get_model_path
+
+
+def load_deep_ccs_predictor() -> tf.keras.models.Model:
+    """ Load the pretrained deep CCS predictor model
+
+    Returns:
+        The Keras model stored at the path that get_model_path resolves for 'DeepCCSPredictor'
+    """
+    model_path = get_model_path('DeepCCSPredictor')
+    return tf.keras.models.load_model(model_path)
+
+
+class PeptideIonMobilityApex(ABC):
+    """
+    Abstract interface for models that simulate ion-mobility apex values
+    """
+
+    def __init__(self):
+        """Nothing to initialise here; concrete subclasses set up their own state."""
+
+    @abstractmethod
+    def simulate_ion_mobilities(self, sequences: list[str], charges: list[int]) -> NDArray:
+        """Simulate ion-mobility values for the given sequences and charges.
+
+        NOTE(review): DeepPeptideIonMobilityApex in this module overrides this method with an
+        additional required ``mz`` argument -- confirm which signature callers rely on.
+        """
+
+
+def get_sqrt_slopes_and_intercepts(mz: np.ndarray, charge: np.ndarray,
+                                   ccs: np.ndarray, fit_charge_state_one: bool = False
+                                   ) -> tuple[np.ndarray, np.ndarray]:
+    """Fit ``ccs = a * sqrt(mz) + b`` separately for each charge state.
+
+    Args:
+        mz: mass-to-charge values, one per ion.
+        charge: charge state of each ion, aligned with ``mz`` (same length).
+        ccs: CCS value of each ion, aligned with ``mz`` (same length).
+        fit_charge_state_one: if True, charge state 1 is fitted as well; otherwise
+            a slope and an intercept of 0.0 are stored in its place.
+
+    Returns:
+        Two float32 arrays ``(slopes, intercepts)`` holding the parameters for
+        charge states 1 to 4, in that order.
+
+    Raises:
+        Whatever ``scipy.optimize.curve_fit`` raises when a fitted charge state has
+        too few data points, e.g. when it does not occur in ``charge`` at all.
+    """
+
+    # defined once instead of once per charge state
+    def sqrt_model(x, a, b):
+        return a * np.sqrt(x) + b
+
+    mz, charge, ccs = np.asarray(mz), np.asarray(charge), np.asarray(ccs)
+
+    if fit_charge_state_one:
+        slopes, intercepts, first_charge = [], [], 1
+    else:
+        # placeholder entry for the unfitted charge state 1, so that position i
+        # always belongs to charge state i + 1
+        slopes, intercepts, first_charge = [0.0], [0.0], 2
+
+    for c in range(first_charge, 5):
+        # boolean mask selects the ions of the current charge state
+        selected = charge == c
+        (slope, intercept), _ = curve_fit(sqrt_model, mz[selected], ccs[selected])
+        slopes.append(slope)
+        intercepts.append(intercept)
+
+    return np.array(slopes, np.float32), np.array(intercepts, np.float32)
+
+
+class ProjectToInitialSqrtCCS(tf.keras.layers.Layer):
+    """
+    Square-root regression layer: per charge state, ccs = slope * sqrt(mz) + intercept
+    """
+
+    def __init__(self, slopes, intercepts):
+        super().__init__()
+        # wrapped in a list so that both constants carry a leading axis of size 1
+        self.slopes = tf.constant([slopes])
+        self.intercepts = tf.constant([intercepts])
+
+    def call(self, inputs):
+        """
+        :param inputs: should contain (mz, charge_one_hot) as its first two items
+        """
+        mz, charge = inputs[0], inputs[1]
+        per_charge = self.slopes * tf.sqrt(mz) + self.intercepts
+        # charge is one-hot encoded, so the product keeps only the prediction of the active charge state
+        gated = per_charge * tf.squeeze(charge)
+        return tf.expand_dims(tf.reduce_sum(gated, axis=1), 1)
+
+
+class GRUCCSPredictor(tf.keras.models.Model):
+    """
+    Deep Learning model combining initial linear fit with sequence based features, both scalar and complex
+    """
+
+    def __init__(self, slopes, intercepts, num_tokens,
+                 seq_len=50,
+                 emb_dim=128,
+                 gru_1=128,
+                 gru_2=64,
+                 rdo=0.0,
+                 do=0.2):
+        """
+        :param slopes: per-charge slopes handed to the initial sqrt projection layer
+        :param intercepts: per-charge intercepts handed to the initial sqrt projection layer
+        :param num_tokens: number of distinct tokens; the embedding uses num_tokens + 1 input ids
+        :param seq_len: input length passed to the embedding layer
+        :param emb_dim: embedding output dimension
+        :param gru_1: units of the first (sequence-returning) bidirectional GRU
+        :param gru_2: units of the second bidirectional GRU
+        :param rdo: recurrent dropout of the second GRU
+        :param do: dropout rate applied after the first dense layer
+        """
+        super().__init__()
+        self.__seq_len = seq_len
+
+        self.initial = ProjectToInitialSqrtCCS(slopes, intercepts)
+
+        self.emb = tf.keras.layers.Embedding(input_dim=num_tokens + 1, output_dim=emb_dim, input_length=seq_len)
+
+        self.gru1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(gru_1, return_sequences=True,
+                                                                      name='GRU1'))
+
+        self.gru2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(gru_2, return_sequences=False,
+                                                                      name='GRU2',
+                                                                      recurrent_dropout=rdo))
+
+        self.dense1 = tf.keras.layers.Dense(128, activation='relu',
+                                            kernel_regularizer=tf.keras.regularizers.l1_l2(1e-3, 1e-3))
+        self.dense2 = tf.keras.layers.Dense(64, activation='relu',
+                                            kernel_regularizer=tf.keras.regularizers.l1_l2(1e-3, 1e-3))
+
+        self.dropout = tf.keras.layers.Dropout(do)
+
+        self.out = tf.keras.layers.Dense(1, activation=None)
+
+    def call(self, inputs):
+        """
+        :param inputs: should contain: (mz, charge_one_hot, seq_as_token_indices)
+        :return: tuple (initial sqrt projection + deep output, deep output alone)
+        """
+        # get inputs
+        mz, charge, seq = inputs[0], inputs[1], inputs[2]
+        # sequence learning
+        x_recurrent = self.gru2(self.gru1(self.emb(seq)))
+        # concat to feed to dense layers
+        concat = tf.keras.layers.Concatenate()([charge, x_recurrent])
+        # regularize
+        d1 = self.dropout(self.dense1(concat))
+        d2 = self.dense2(d1)
+        # evaluate the output layer once and reuse it for both returned tensors
+        deep = self.out(d2)
+        # combine simple linear hypotheses with deep part
+        return self.initial([mz, charge]) + deep, deep
+
+
+class DeepPeptideIonMobilityApex(PeptideIonMobilityApex):
+    """Ion-mobility apex simulation backed by a CCS prediction model and its tokenizer."""
+
+    def __init__(self, model: GRUCCSPredictor, tokenizer: tf.keras.preprocessing.text.Tokenizer):
+        super().__init__()
+        self.model = model
+        self.tokenizer = tokenizer
+
+    def _preprocess_sequences(self, sequences: list[str], pad_len: int = 50) -> NDArray:
+        """Tokenize the sequences and post-pad the resulting token ids to ``pad_len``."""
+        tokens = [tokenize_unimod_sequence(seq) for seq in sequences]
+        token_ids = self.tokenizer.texts_to_sequences(tokens)
+        return tf.keras.preprocessing.sequence.pad_sequences(token_ids, pad_len, padding='post')
+
+    def simulate_ion_mobilities(self,
+                                sequences: list[str],
+                                charges: list[int],
+                                mz: list[float],
+                                verbose: bool = False,
+                                batch_size: int = 1024) -> NDArray:
+        """Predict CCS with the model and convert each value with ``ccs_to_one_over_k0``.
+
+        Args:
+            sequences: peptide sequences, tokenized via ``tokenize_unimod_sequence``.
+            charges: charge state per sequence.
+            mz: mass-to-charge value per sequence.
+            verbose: forwarded to ``model.predict``.
+            batch_size: batch size of the dataset fed to ``model.predict``.
+
+        Returns:
+            Array built from ``ccs_to_one_over_k0(ccs, mz, charge)`` for every input.
+        """
+        token_matrix = self._preprocess_sequences(sequences)
+
+        # model inputs: mz as a column, charges one-hot encoded over 4 classes (charge 1 -> index 0)
+        mz_column = np.expand_dims(mz, 1)
+        charge_one_hot = tf.one_hot(np.array(charges) - 1, 4)
+
+        # the zeros fill the target slot of the dataset -- presumably placeholders only, TODO confirm
+        features = (mz_column, charge_one_hot, token_matrix)
+        ds = tf.data.Dataset.from_tensor_slices((features, np.zeros_like(mz))).batch(batch_size)
+        ccs, _ = self.model.predict(ds, verbose=verbose)
+
+        return np.array([
+            ccs_to_one_over_k0(ccs_value, mz_value, z)
+            for ccs_value, mz_value, z in zip(ccs, mz, charges)
+        ])
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from imspy.core.spectrum import TimsSpectrum, MzSpectrum
		from imspy.timstof.data import TimsDataset