Noble-Lab · Lilferrit · Aug 21, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 31, 2024
diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.md b/.github/ISSUE_TEMPLATE/bug_report_template.md
@@ -0,0 +1,55 @@
+---
+name: Bug Report
+about: Submit a Casanovo Bug Report
+labels: bug
+---
+
+## Describe the Issue
+A clear and concise description of what the issue/bug is.
+
+## Steps To Reproduce
+Steps to reproduce the incorrect behavior.
+
+## Expected Behavior
+A clear and concise description of what you expected to happen.
+
+## Terminal Output (If Applicable)
+Provide any applicable console output in between the tick marks below.
+
+```
+
+```
+
+## Environment
+- OS: [e.g. Windows 11, Windows 10, macOS 14, Ubuntu 24.04]
+- Casanovo Version: [e.g. 4.2.1]
+- Hardware Used (CPU or GPU, if GPU also GPU model and CUDA version): [e.g. GPU: NVIDIA GeForce RTX 2070, CUDA Version: 12.5]
+
+### Checking GPU Version
+
+The GPU model can be checked by typing `nvidia-smi` into a terminal/console window.
+An example of how to use this command is shown below.
+In this case, the CUDA version is 12.5 and the GPU model is GeForce RTX 2070.
+
+
+```
+(casanovo_env) C:\Users\<user>\OneDrive\Documents\casanovo>nvidia-smi
+Fri Aug  2 12:34:57 2024       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 555.99                 Driver Version: 555.99         CUDA Version: 12.5     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA GeForce RTX 2070 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
+| N/A   60C    P8             16W /   90W |    1059MiB /   8192MiB |      0%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+```
+
+## Additional Context
+Add any other context about the problem here.
+
+## Attach Files
+Please attach all input files used and the full Casanovo log file.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -42,7 +42,7 @@ jobs:
       run: |
         pytest --cov=casanovo tests/
     - name: Upload coverage to codecov
-      uses: codecov/codecov-action@v3
+      uses: codecov/codecov-action@v4
       with:
         token: ${{ secrets.CODECOV_TOKEN }}
         fail_ci_if_error: true
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
 If you use Casanovo in your work, please cite the following publications:
 
 - Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. *De novo* mass spectrometry peptide sequencing with a transformer model. in *Proceedings of the 39th International Conference on Machine Learning - ICML '22* vol. 162 25514–25522 (PMLR, 2022). [https://proceedings.mlr.press/v162/yilmaz22a.html](https://proceedings.mlr.press/v162/yilmaz22a.html)
-- Yilmaz, M., Fondrie, W. E., Bittremieux, W., Nelson, R., Ananth, V., Oh, S. & Noble, W. S. Sequence-to-sequence translation from mass spectra to peptides with a transformer model. in *bioRxiv* (2023). [doi:10.1101/2023.01.03.522621](https://doi.org/10.1101/2023.01.03.522621)
+- Yilmaz, M., Fondrie, W. E., Bittremieux, W., Melendez, C. F., Nelson, R., Ananth, V., Oh, S. & Noble, W. S. Sequence-to-sequence translation from mass spectra to peptides with a transformer model. in *Nature Communications* **15**, 6427 (2024). [doi:10.1038/s41467-024-49731-x](https://doi.org/10.1038/s41467-024-49731-x)
 
 ## Documentation
 

diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
@@ -128,64 +128,50 @@ def main() -> None:
     nargs=-1,
     type=click.Path(exists=True, dir_okay=False),
 )
+@click.option(
+    "--evaluate",
+    "-e",
+    is_flag=True,
+    default=False,
+    help="""
+    Run in evaluation mode. When this flag is set the peptide and amino
+    acid precision will be calculated and logged at the end of the sequencing
+    run. All input files must be annotated MGF files if running in evaluation
+    mode.
+    """,
+)
 def sequence(
     peak_path: Tuple[str],
     model: Optional[str],
     config: Optional[str],
     output: Optional[str],
     verbosity: str,
+    evaluate: bool,
 ) -> None:
     """De novo sequence peptides from tandem mass spectra.
 
-    PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which
-    to sequence peptides.
+    PEAK_PATH must be one or more mzML, mzXML, or MGF files from which
+    to sequence peptides. If evaluate is set to true peak_path must be
+    one or more annotated MGF files.
     """
     output = setup_logging(output, verbosity)
     config, model = setup_model(model, config, output, False)
     start_time = time.time()
     with ModelRunner(config, model) as runner:
-        logger.info("Sequencing peptides from:")
+        logger.info(
+            "Sequencing %speptides from:",
+            "and evaluating " if evaluate else "",
+        )
         for peak_file in peak_path:
             logger.info("  %s", peak_file)
 
-        runner.predict(peak_path, output)
+        runner.predict(peak_path, output, evaluate=evaluate)
         psms = runner.writer.psms
         utils.log_sequencing_report(
             psms, start_time=start_time, end_time=time.time()
         )
 
 
-@main.command(cls=_SharedParams)
-@click.argument(
-    "annotated_peak_path",
-    required=True,
-    nargs=-1,
-    type=click.Path(exists=True, dir_okay=False),
-)
-def evaluate(
-    annotated_peak_path: Tuple[str],
-    model: Optional[str],
-    config: Optional[str],
-    output: Optional[str],
-    verbosity: str,
-) -> None:
-    """Evaluate de novo peptide sequencing performance.
-
-    ANNOTATED_PEAK_PATH must be one or more annoated MGF files,
-    such as those provided by MassIVE-KB.
-    """
-    output = setup_logging(output, verbosity)
-    config, model = setup_model(model, config, output, False)
-    start_time = time.time()
-    with ModelRunner(config, model) as runner:
-        logger.info("Sequencing and evaluating peptides from:")
-        for peak_file in annotated_peak_path:
-            logger.info("  %s", peak_file)
-
-        runner.evaluate(annotated_peak_path)
-        utils.log_run_report(start_time=start_time, end_time=time.time())
-
-
 @main.command(cls=_SharedParams)
 @click.argument(
     "train_peak_path",

diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py
@@ -83,7 +83,9 @@ def __getitem__(
             The unique spectrum identifier, formed by its original peak file and
             identifier (index or scan number) therein.
         """
-        mz_array, int_array, precursor_mz, precursor_charge = self.index[idx]
+        mz_array, int_array, precursor_mz, precursor_charge = self.index[idx][
+            :4
+        ]
         spectrum = self._process_peaks(
             mz_array, int_array, precursor_mz, precursor_charge
         )

diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 from typing import Iterable, List, Optional, Union
 
+import depthcharge.masses
 import lightning.pytorch as pl
 import numpy as np
 import torch
@@ -20,6 +21,7 @@
 from ..config import Config
 from ..data import ms_io
 from ..denovo.dataloaders import DeNovoDataModule
+from ..denovo.evaluate import aa_match_batch, aa_match_metrics
 from ..denovo.model import Spec2Pep
 
 
@@ -116,36 +118,52 @@
             self.loaders.val_dataloader(),
         )
 
-    def evaluate(self, peak_path: Iterable[str]) -> None:
-        """Evaluate peptide sequence preditions from a trained Casanovo model.
+    def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None:
+        """Log pep_precision and aa_precision
+
+        Calculate and log peptide precision and amino acid precision
+        based on model predictions and spectrum annotations.
 
         Parameters
         ----------
-        peak_path : iterable of str
-            The path with MS data files for predicting peptide sequences.
-
-        Returns
-        -------
-        self
+        test_index : AnnotatedSpectrumIndex
+            Index containing the annotated spectra used to generate model
+            predictions
         """
-        self.initialize_trainer(train=False)
-        self.initialize_model(train=False)
-
-        test_index = self._get_index(peak_path, True, "evaluation")
-        self.initialize_data_module(test_index=test_index)
-        self.loaders.setup(stage="test", annotated=True)
+        model_output = [psm[0] for psm in self.writer.psms]
+        spectrum_annotations = [
+            test_index[i][4] for i in range(test_index.n_spectra)
+        ]
+        aa_precision, _, pep_precision = aa_match_metrics(
+            *aa_match_batch(
+                spectrum_annotations,
+                model_output,
+                depthcharge.masses.PeptideMass().masses,
+            )
+        )
 
-        self.trainer.validate(self.model, self.loaders.test_dataloader())
+        logger.info("Peptide Precision: %f", pep_precision)
+        logger.info("Amino Acid Precision: %f", aa_precision)
 
-    def predict(self, peak_path: Iterable[str], output: str) -> None:
+    def predict(
+        self, peak_path: Iterable[str], output: str, evaluate: bool = False
+    ) -> None:
         """Predict peptide sequences with a trained Casanovo model.
 
+        Can also evaluate model during prediction if provided with annotated
+        peak files.
+
         Parameters
         ----------
         peak_path : iterable of str
             The path with the MS data files for predicting peptide sequences.
         output : str
             Where should the output be saved?
+        evaluate : bool
+            Whether to run model evaluation in addition to inference.
+            Note: peak_path must point to annotated MS data files when
+            running model evaluation. Files that are not an annotated
+            peak file format will be ignored if evaluate is set to true.
 
         Returns
         -------
@@ -162,12 +180,15 @@
         self.initialize_model(train=False)
         self.model.out_writer = self.writer
 
-        test_index = self._get_index(peak_path, False, "")
+        test_index = self._get_index(peak_path, evaluate, "")
         self.writer.set_ms_run(test_index.ms_files)
         self.initialize_data_module(test_index=test_index)
         self.loaders.setup(stage="test", annotated=False)
         self.trainer.predict(self.model, self.loaders.test_dataloader())
 
+        if evaluate:
+            self.log_metrics(test_index)
+
     def initialize_trainer(self, train: bool) -> None:
         """Initialize the lightning Trainer.