From 18689807c46bffab0740b88b10a768dfd49defdc Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 12:09:38 -0700 Subject: [PATCH 01/24] file io console options --- casanovo/casanovo.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 0c220ddb..a84d5ea0 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,8 +67,13 @@ def __init__(self, *args, **kwargs) -> None: """, ), click.Option( - ("-o", "--output"), - help="The mzTab file to which results will be written.", + ("-o", "--output-dir"), + help="The destination directory for output files", + type=click.Path(dir_okay=True), + ), + click.Option( + ("-r", "--output-root"), + help="The root name for all output files", type=click.Path(dir_okay=False), ), click.Option( @@ -90,6 +95,15 @@ def __init__(self, *args, **kwargs) -> None: ), default="info", ), + click.Option( + ("-d", "--overwrite"), + help=""" + Whether to overwrite output files. + """, + is_flag=True, + show_default=True, + default=False, + ), ] @@ -144,8 +158,10 @@ def sequence( peak_path: Tuple[str], model: Optional[str], config: Optional[str], - output: Optional[str], + output_dir: Optional[str], + output_root: Optional[str], verbosity: str, + overwrite: bool, evaluate: bool, ) -> None: """De novo sequence peptides from tandem mass spectra. @@ -195,8 +211,10 @@ def train( validation_peak_path: Tuple[str], model: Optional[str], config: Optional[str], - output: Optional[str], + output_dir: Optional[str], + output_root: Optional[str], verbosity: str, + overwrite: bool, ) -> None: """Train a Casanovo model on your own data. From 8a346e02b805c19d933cf01f4aec54cd34ab58de Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 14:32:27 -0700 Subject: [PATCH 02/24] output console io options --- casanovo/casanovo.py | 27 +++++++++++++++++------ casanovo/config.py | 2 +- casanovo/config.yaml | 2 -- casanovo/denovo/model_runner.py | 27 +++++++++++++++++++---- casanovo/utils.py | 38 ++++++++++++++++++++++++++++++--- 5 files changed, 80 insertions(+), 16 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index a84d5ea0..f1b8bc8e 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,12 +67,12 @@ def __init__(self, *args, **kwargs) -> None: """, ), click.Option( - ("-o", "--output-dir"), + ("-f", "--output-dir"), help="The destination directory for output files", type=click.Path(dir_okay=True), ), click.Option( - ("-r", "--output-root"), + ("-o", "--output-root"), help="The root name for all output files", type=click.Path(dir_okay=False), ), @@ -170,10 +170,17 @@ def sequence( to sequence peptides. If evaluate is set to True PEAK_PATH must be one or more annotated MGF file. """ - output = setup_logging(output, verbosity) + output_dir = Path(output_dir) if output_dir is not None else Path.cwd() + output_base_name = output_dir / output_root + if output_root is not None and not overwrite: + base_pattern = re.escape(output_root) + patterns = [base_pattern + r"\.log", base_pattern + r"\.mztab"] + utils.check_dir(output_dir, patterns) + output = setup_logging(str(output_base_name), verbosity) + config, model = setup_model(model, config, output, False) start_time = time.time() - with ModelRunner(config, model) as runner: + with ModelRunner(config, model, output_root, output_dir, False) as runner: logger.info( "Sequencing %speptides from:", "and evaluating " if evaluate else "", @@ -221,10 +228,18 @@ def train( TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB, from which to train a new Casnovo model. """ - output = setup_logging(output, verbosity) + output_dir = Path(output_dir) if output_dir is not None else Path.cwd() + output_base_name = None + if output_root is not None and not overwrite: + output_base_name = output_dir / output_root + utils.check_dir(output_dir, [re.escape(output_root) + r"\.log"]) + output = setup_logging(output_base_name, verbosity) + config, model = setup_model(model, config, output, True) start_time = time.time() - with ModelRunner(config, model) as runner: + with ModelRunner( + config, model, output_root, output_dir, not overwrite + ) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: logger.info(" %s", peak_file) diff --git a/casanovo/config.py b/casanovo/config.py index 453f7b15..dae46606 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -19,6 +19,7 @@ every_n_train_steps="val_check_interval", max_iters="cosine_schedule_period_iters", save_top_k=None, + model_save_folder_path=None, ) @@ -75,7 +76,6 @@ class Config: top_match=int, max_epochs=int, num_sanity_val_steps=int, - model_save_folder_path=str, val_check_interval=int, calculate_precision=bool, accelerator=str, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 3beb5f30..1fcc77b4 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -42,8 +42,6 @@ random_seed: 454 n_log: 1 # Tensorboard directory to use for keeping track of training metrics. tb_summarywriter: -# Path to saved checkpoints. -model_save_folder_path: "" # Model validation and checkpointing frequency in training steps. val_check_interval: 50_000 diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b88c5542..cf443e1a 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -4,6 +4,7 @@ import glob import logging import os +import re import tempfile import uuid import warnings @@ -18,6 +19,7 @@ from lightning.pytorch.strategies import DDPStrategy from lightning.pytorch.callbacks import ModelCheckpoint +from .. import utils from ..config import Config from ..data import ms_io from ..denovo.dataloaders import DeNovoDataModule @@ -47,6 +49,8 @@ def __init__( config: Config, model_filename: Optional[str] = None, output_rootname: Optional[str] = None, + output_dir: Optional[str] = None, + overwrite_ckpt_check: bool = True, ) -> None: """Initialize a ModelRunner""" self.config = config @@ -59,20 +63,35 @@ def __init__( self.loaders = None self.writer = None - best_filename = "best" + filenames = ("{epoch}-{step}", "best") if output_rootname is not None: - best_filename = f"{output_rootname}.{best_filename}" + filenames = tuple( + f"{output_rootname}.{curr_name}" for curr_name in filenames + ) + curr_filename, best_filename = filenames + + if overwrite_ckpt_check: + patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"] + if output_rootname is not None: + patterns = [ + re.escape(output_rootname + ".") + pattern + for pattern in patterns + ] + utils.check_dir(output_dir, patterns) # Configure checkpoints. self.callbacks = [ ModelCheckpoint( - dirpath=config.model_save_folder_path, + dirpath=output_dir, save_on_train_epoch_end=True, + filename=curr_filename, + enable_version_counter=False, ), ModelCheckpoint( - dirpath=config.model_save_folder_path, + dirpath=output_dir, monitor="valid_CELoss", filename=best_filename, + enable_version_counter=False, ), ] diff --git a/casanovo/utils.py b/casanovo/utils.py index fde6cd05..48a2cacf 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -1,15 +1,14 @@ """Small utility functions""" -import heapq import logging import os +import pathlib import platform import re import socket import sys -import time from datetime import datetime -from typing import Tuple, Dict, List, Optional +from typing import Tuple, Dict, List, Optional, Iterable import numpy as np import pandas as pd @@ -203,6 +202,8 @@ def log_sequencing_report( """ Log sequencing run report + Parameters + ---------- next_prediction : Tuple[ str, Tuple[str, str], float, float, float, float, str ] @@ -251,3 +252,34 @@ def log_sequencing_report( logger.info( "Median Peptide Length: %d", run_report["median_sequence_length"] ) + + +def check_dir( + dir: pathlib.Path, file_patterns: Iterable[re.Pattern[str]] +) -> None: + """ + Check that no file names in dir match any of file_patterns + + Parameters + ---------- + dir : pathlib.Path + The directory to check for matching file names + file_patterns : Iterable[re.Pattern[str]] + File name re patterns to test file names against + + Raises + ------ + FileExistsError + If matching file name is found in dir + """ + for pattern in file_patterns: + comp_pattern = re.compile(pattern) + for file in dir.iterdir(): + if not file.is_file(): + continue + + if comp_pattern.fullmatch(file.name) is not None: + raise FileExistsError( + f"File {file.name} already exists in {dir} " + "and can not be overwritten." + ) From 6e043d15a6416ac6ab48837561bdaeb29675df1d Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 15:57:40 -0700 Subject: [PATCH 03/24] file io options tests --- casanovo/casanovo.py | 16 +++++++---- casanovo/denovo/model_runner.py | 2 +- tests/test_integration.py | 50 +++++++++++++++++++++++++-------- tests/unit_tests/test_runner.py | 44 ++++++++++++++++++++++------- tests/unit_tests/test_unit.py | 12 ++++++++ 5 files changed, 96 insertions(+), 28 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index f1b8bc8e..ca87729f 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,12 +67,12 @@ def __init__(self, *args, **kwargs) -> None: """, ), click.Option( - ("-f", "--output-dir"), + ("-f", "--output_dir"), help="The destination directory for output files", type=click.Path(dir_okay=True), ), click.Option( - ("-o", "--output-root"), + ("-o", "--output_root"), help="The root name for all output files", type=click.Path(dir_okay=False), ), @@ -171,12 +171,13 @@ def sequence( one or more annotated MGF file. """ output_dir = Path(output_dir) if output_dir is not None else Path.cwd() - output_base_name = output_dir / output_root + output_base_name = None if output_root is not None and not overwrite: + output_base_name = output_dir / output_root base_pattern = re.escape(output_root) patterns = [base_pattern + r"\.log", base_pattern + r"\.mztab"] utils.check_dir(output_dir, patterns) - output = setup_logging(str(output_base_name), verbosity) + output = setup_logging(output_base_name, verbosity) config, model = setup_model(model, config, output, False) start_time = time.time() @@ -209,13 +210,13 @@ def sequence( An annotated MGF file for validation, like from MassIVE-KB. Use this option multiple times to specify multiple files. """, - required=True, + required=False, multiple=True, type=click.Path(exists=True, dir_okay=False), ) def train( train_peak_path: Tuple[str], - validation_peak_path: Tuple[str], + validation_peak_path: Optional[Tuple[str]], model: Optional[str], config: Optional[str], output_dir: Optional[str], @@ -244,6 +245,9 @@ def train( for peak_file in train_peak_path: logger.info(" %s", peak_file) + if len(validation_peak_path) == 0: + validation_peak_path = train_peak_path + logger.info("Using the following validation files:") for peak_file in validation_peak_path: logger.info(" %s", peak_file) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index cf443e1a..23f7f3ca 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -70,7 +70,7 @@ def __init__( ) curr_filename, best_filename = filenames - if overwrite_ckpt_check: + if overwrite_ckpt_check and output_dir is not None: patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"] if output_rootname is not None: patterns = [ diff --git a/tests/test_integration.py b/tests/test_integration.py index 47cee936..ded8294d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,6 +3,7 @@ from pathlib import Path import pyteomics.mztab +import pytest from click.testing import CliRunner from casanovo import casanovo @@ -29,28 +30,40 @@ def test_train_and_run( str(mgf_small), "--config", tiny_config, - "--output", - str(tmp_path / "train"), + "--output_dir", + str(tmp_path), + "--output_root", + "train", str(mgf_small), # The training files. ] result = run(train_args) - model_file = tmp_path / "epoch=19-step=20.ckpt" - best_model = tmp_path / "best.ckpt" + model_file = tmp_path / "train.epoch=19-step=20.ckpt" + best_model = tmp_path / "train.best.ckpt" assert result.exit_code == 0 assert model_file.exists() assert best_model.exists() - # Finally try predicting: - output_filename = tmp_path / "test.mztab" + # Check that re-running train fails due to no overwrite + with pytest.raises(FileExistsError): + run(train_args) + + assert model_file.exists() + assert best_model.exists() + + # Try predicting: + output_rootname = "test" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") predict_args = [ "sequence", "--model", str(model_file), "--config", tiny_config, - "--output", - str(output_filename), + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, str(mgf_small), str(mzml_small), ] @@ -77,16 +90,25 @@ def test_train_and_run( assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" + # Verify that running predict again fails due to no overwrite + with pytest.raises(FileExistsError): + run(predict_args) + + assert output_filename.is_file() + # Finally, try evaluating: - output_filename = tmp_path / "test-eval.mztab" + output_rootname = "test-eval" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") eval_args = [ "sequence", "--model", str(model_file), "--config", tiny_config, - "--output", - str(output_filename), + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, str(mgf_small), str(mzml_small), "--evaluate", @@ -134,6 +156,12 @@ def test_train_and_run( ] ) + # Verify that running again fails due to no overwrite + with pytest.raises(FileExistsError): + run(eval_args) + + assert output_filename.is_file() + def test_auxilliary_cli(tmp_path, monkeypatch): """Test the secondary CLI commands""" diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index db21725a..742c35b4 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -161,16 +161,26 @@ def test_save_final_model(tmp_path, mgf_small, tiny_config): config = Config(tiny_config) config.val_check_interval = 50 model_file = tmp_path / "epoch=19-step=20.ckpt" - with ModelRunner(config) as runner: + with ModelRunner(config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) + assert model_file.exists() + + # Test that training again raises file exists error + with pytest.raises(FileExistsError): + with ModelRunner(config, output_dir=tmp_path) as runner: + runner.train([mgf_small], [mgf_small]) + assert model_file.exists() Path.unlink(model_file) # Test checkpoint saving when val_check_interval is not a factor of training steps config.val_check_interval = 15 validation_file = tmp_path / "foobar.best.ckpt" - with ModelRunner(config, output_rootname="foobar") as runner: + model_file = tmp_path / "foobar.epoch=19-step=20.ckpt" + with ModelRunner( + config, output_dir=tmp_path, output_rootname="foobar" + ) as runner: runner.train([mgf_small], [mgf_small]) assert model_file.exists() @@ -185,14 +195,16 @@ def test_evaluate( config = Config(tiny_config) config.max_epochs = 1 model_file = tmp_path / "epoch=0-step=1.ckpt" - with ModelRunner(config) as runner: + with ModelRunner(config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) assert model_file.is_file() # Test evaluation with annotated peak file result_file = tmp_path / "result.mztab" - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mgf_small], result_file, evaluate=True) assert result_file.is_file() @@ -205,15 +217,21 @@ def test_evaluate( ) with pytest.raises(FileNotFoundError): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mzml_small], result_file, evaluate=True) with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mgf_small_unannotated], result_file, evaluate=True) with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict( [mgf_small_unannotated, mzml_small], result_file, evaluate=True ) @@ -225,14 +243,18 @@ def test_evaluate( # Test mix of annotated an unannotated peak files with pytest.warns(RuntimeWarning): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mgf_small, mzml_small], result_file, evaluate=True) assert result_file.is_file() result_file.unlink() with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict( [mgf_small, mgf_small_unannotated], result_file, evaluate=True ) @@ -241,7 +263,9 @@ def test_evaluate( result_file.unlink() with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict( [mgf_small, mgf_small_unannotated, mzml_small], result_file, diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 120e341a..33f20bf5 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -920,3 +920,15 @@ def test_run_map(mgf_small): out_writer.set_ms_run([os.path.abspath(mgf_small.name)]) assert os.path.basename(mgf_small.name) not in out_writer._run_map assert os.path.abspath(mgf_small.name) in out_writer._run_map + + +def test_check_dir(tmp_path): + exists_path = tmp_path / "exists-1234.ckpt" + exists_pattern = r"exists\-\d+\.ckpt" + exists_path.touch() + dne_pattern = r"dne\-\d+\.ckpt" + + with pytest.raises(FileExistsError): + utils.check_dir(tmp_path, [exists_pattern, dne_pattern]) + + utils.check_dir(tmp_path, [dne_pattern]) From ee88344cebf4aaf6b63452768db3199dc056664d Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 16:02:31 -0700 Subject: [PATCH 04/24] changelog entry --- CHANGELOG.md | 3 +++ tests/conftest.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 12a92e38..020883a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag. +- The `--output` option has been split into two options, `--output_dir` and `--output_root`. +- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training. +- The `--validation_peak_path` is now optional during training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for evaluation. ### Fixed diff --git a/tests/conftest.py b/tests/conftest.py index bff6b011..68e4214a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -214,7 +214,6 @@ def tiny_config(tmp_path): "cosine_schedule_period_iters": 1, "max_epochs": 20, "val_check_interval": 1, - "model_save_folder_path": str(tmp_path), "accelerator": "cpu", "precursor_mass_tol": 5, "isotope_error_range": [0, 1], From 4da13578e83758b2c137039091639023dcffa2c3 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 17:44:00 -0700 Subject: [PATCH 05/24] revised changelog --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 020883a3..af430ec0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,8 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag. - The `--output` option has been split into two options, `--output_dir` and `--output_root`. -- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training. -- The `--validation_peak_path` is now optional during training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for evaluation. +- The `--validation_peak_path` is now optional when training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for validation. ### Fixed @@ -24,7 +23,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Removed -- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `.best.ckpt`. +- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `.best.ckpt`. +- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training. ## [4.2.1] - 2024-06-25 From 653deedbf26c6fe68ce679675b64030a320d1fc1 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 12:09:38 -0700 Subject: [PATCH 06/24] file io console options --- casanovo/casanovo.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 0c220ddb..a84d5ea0 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,8 +67,13 @@ def __init__(self, *args, **kwargs) -> None: """, ), click.Option( - ("-o", "--output"), - help="The mzTab file to which results will be written.", + ("-o", "--output-dir"), + help="The destination directory for output files", + type=click.Path(dir_okay=True), + ), + click.Option( + ("-r", "--output-root"), + help="The root name for all output files", type=click.Path(dir_okay=False), ), click.Option( @@ -90,6 +95,15 @@ def __init__(self, *args, **kwargs) -> None: ), default="info", ), + click.Option( + ("-d", "--overwrite"), + help=""" + Whether to overwrite output files. + """, + is_flag=True, + show_default=True, + default=False, + ), ] @@ -144,8 +158,10 @@ def sequence( peak_path: Tuple[str], model: Optional[str], config: Optional[str], - output: Optional[str], + output_dir: Optional[str], + output_root: Optional[str], verbosity: str, + overwrite: bool, evaluate: bool, ) -> None: """De novo sequence peptides from tandem mass spectra. @@ -195,8 +211,10 @@ def train( validation_peak_path: Tuple[str], model: Optional[str], config: Optional[str], - output: Optional[str], + output_dir: Optional[str], + output_root: Optional[str], verbosity: str, + overwrite: bool, ) -> None: """Train a Casanovo model on your own data. From 837b769709d26ebeeab091f3ddd79ba116f3939f Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 14:32:27 -0700 Subject: [PATCH 07/24] output console io options --- casanovo/casanovo.py | 27 +++++++++++++++++++------ casanovo/config.py | 2 +- casanovo/config.yaml | 2 -- casanovo/denovo/model_runner.py | 27 +++++++++++++++++++++---- casanovo/utils.py | 36 ++++++++++++++++++++++++++++++++- 5 files changed, 80 insertions(+), 14 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index a84d5ea0..f1b8bc8e 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,12 +67,12 @@ def __init__(self, *args, **kwargs) -> None: """, ), click.Option( - ("-o", "--output-dir"), + ("-f", "--output-dir"), help="The destination directory for output files", type=click.Path(dir_okay=True), ), click.Option( - ("-r", "--output-root"), + ("-o", "--output-root"), help="The root name for all output files", type=click.Path(dir_okay=False), ), @@ -170,10 +170,17 @@ def sequence( to sequence peptides. If evaluate is set to True PEAK_PATH must be one or more annotated MGF file. """ - output = setup_logging(output, verbosity) + output_dir = Path(output_dir) if output_dir is not None else Path.cwd() + output_base_name = output_dir / output_root + if output_root is not None and not overwrite: + base_pattern = re.escape(output_root) + patterns = [base_pattern + r"\.log", base_pattern + r"\.mztab"] + utils.check_dir(output_dir, patterns) + output = setup_logging(str(output_base_name), verbosity) + config, model = setup_model(model, config, output, False) start_time = time.time() - with ModelRunner(config, model) as runner: + with ModelRunner(config, model, output_root, output_dir, False) as runner: logger.info( "Sequencing %speptides from:", "and evaluating " if evaluate else "", @@ -221,10 +228,18 @@ def train( TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB, from which to train a new Casnovo model. """ - output = setup_logging(output, verbosity) + output_dir = Path(output_dir) if output_dir is not None else Path.cwd() + output_base_name = None + if output_root is not None and not overwrite: + output_base_name = output_dir / output_root + utils.check_dir(output_dir, [re.escape(output_root) + r"\.log"]) + output = setup_logging(output_base_name, verbosity) + config, model = setup_model(model, config, output, True) start_time = time.time() - with ModelRunner(config, model) as runner: + with ModelRunner( + config, model, output_root, output_dir, not overwrite + ) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: logger.info(" %s", peak_file) diff --git a/casanovo/config.py b/casanovo/config.py index 453f7b15..dae46606 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -19,6 +19,7 @@ every_n_train_steps="val_check_interval", max_iters="cosine_schedule_period_iters", save_top_k=None, + model_save_folder_path=None, ) @@ -75,7 +76,6 @@ class Config: top_match=int, max_epochs=int, num_sanity_val_steps=int, - model_save_folder_path=str, val_check_interval=int, calculate_precision=bool, accelerator=str, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 3beb5f30..1fcc77b4 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -42,8 +42,6 @@ random_seed: 454 n_log: 1 # Tensorboard directory to use for keeping track of training metrics. tb_summarywriter: -# Path to saved checkpoints. -model_save_folder_path: "" # Model validation and checkpointing frequency in training steps. val_check_interval: 50_000 diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 0f39fe46..d34e2285 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -4,6 +4,7 @@ import glob import logging import os +import re import tempfile import uuid import warnings @@ -18,6 +19,7 @@ from lightning.pytorch.strategies import DDPStrategy from lightning.pytorch.callbacks import ModelCheckpoint +from .. import utils from ..config import Config from ..data import ms_io from ..denovo.dataloaders import DeNovoDataModule @@ -47,6 +49,8 @@ def __init__( config: Config, model_filename: Optional[str] = None, output_rootname: Optional[str] = None, + output_dir: Optional[str] = None, + overwrite_ckpt_check: bool = True, ) -> None: """Initialize a ModelRunner""" self.config = config @@ -59,20 +63,35 @@ def __init__( self.loaders = None self.writer = None - best_filename = "best" + filenames = ("{epoch}-{step}", "best") if output_rootname is not None: - best_filename = f"{output_rootname}.{best_filename}" + filenames = tuple( + f"{output_rootname}.{curr_name}" for curr_name in filenames + ) + curr_filename, best_filename = filenames + + if overwrite_ckpt_check: + patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"] + if output_rootname is not None: + patterns = [ + re.escape(output_rootname + ".") + pattern + for pattern in patterns + ] + utils.check_dir(output_dir, patterns) # Configure checkpoints. self.callbacks = [ ModelCheckpoint( - dirpath=config.model_save_folder_path, + dirpath=output_dir, save_on_train_epoch_end=True, + filename=curr_filename, + enable_version_counter=False, ), ModelCheckpoint( - dirpath=config.model_save_folder_path, + dirpath=output_dir, monitor="valid_CELoss", filename=best_filename, + enable_version_counter=False, ), ] diff --git a/casanovo/utils.py b/casanovo/utils.py index 0d2698ce..3d778791 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -2,12 +2,13 @@ import logging import os +import pathlib import platform import re import socket import sys from datetime import datetime -from typing import Tuple, Dict, List, Optional +from typing import Tuple, Dict, List, Optional, Iterable import numpy as np import pandas as pd @@ -203,6 +204,8 @@ def log_sequencing_report( """ Log sequencing run report + Parameters + ---------- next_prediction : Tuple[ str, Tuple[str, str], float, float, float, float, str ] @@ -251,3 +254,34 @@ def log_sequencing_report( logger.info( "Median Peptide Length: %d", run_report["median_sequence_length"] ) + + +def check_dir( + dir: pathlib.Path, file_patterns: Iterable[re.Pattern[str]] +) -> None: + """ + Check that no file names in dir match any of file_patterns + + Parameters + ---------- + dir : pathlib.Path + The directory to check for matching file names + file_patterns : Iterable[re.Pattern[str]] + File name re patterns to test file names against + + Raises + ------ + FileExistsError + If matching file name is found in dir + """ + for pattern in file_patterns: + comp_pattern = re.compile(pattern) + for file in dir.iterdir(): + if not file.is_file(): + continue + + if comp_pattern.fullmatch(file.name) is not None: + raise FileExistsError( + f"File {file.name} already exists in {dir} " + "and can not be overwritten." + ) From 2483d67e886170687d05f6959fcb08b81646d1d4 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 15:57:40 -0700 Subject: [PATCH 08/24] file io options tests --- casanovo/casanovo.py | 16 +++++++---- casanovo/denovo/model_runner.py | 2 +- tests/test_integration.py | 50 +++++++++++++++++++++++++-------- tests/unit_tests/test_runner.py | 44 ++++++++++++++++++++++------- tests/unit_tests/test_unit.py | 12 ++++++++ 5 files changed, 96 insertions(+), 28 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index f1b8bc8e..ca87729f 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,12 +67,12 @@ def __init__(self, *args, **kwargs) -> None: """, ), click.Option( - ("-f", "--output-dir"), + ("-f", "--output_dir"), help="The destination directory for output files", type=click.Path(dir_okay=True), ), click.Option( - ("-o", "--output-root"), + ("-o", "--output_root"), help="The root name for all output files", type=click.Path(dir_okay=False), ), @@ -171,12 +171,13 @@ def sequence( one or more annotated MGF file. """ output_dir = Path(output_dir) if output_dir is not None else Path.cwd() - output_base_name = output_dir / output_root + output_base_name = None if output_root is not None and not overwrite: + output_base_name = output_dir / output_root base_pattern = re.escape(output_root) patterns = [base_pattern + r"\.log", base_pattern + r"\.mztab"] utils.check_dir(output_dir, patterns) - output = setup_logging(str(output_base_name), verbosity) + output = setup_logging(output_base_name, verbosity) config, model = setup_model(model, config, output, False) start_time = time.time() @@ -209,13 +210,13 @@ def sequence( An annotated MGF file for validation, like from MassIVE-KB. Use this option multiple times to specify multiple files. """, - required=True, + required=False, multiple=True, type=click.Path(exists=True, dir_okay=False), ) def train( train_peak_path: Tuple[str], - validation_peak_path: Tuple[str], + validation_peak_path: Optional[Tuple[str]], model: Optional[str], config: Optional[str], output_dir: Optional[str], @@ -244,6 +245,9 @@ def train( for peak_file in train_peak_path: logger.info(" %s", peak_file) + if len(validation_peak_path) == 0: + validation_peak_path = train_peak_path + logger.info("Using the following validation files:") for peak_file in validation_peak_path: logger.info(" %s", peak_file) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index d34e2285..bf30fad8 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -70,7 +70,7 @@ def __init__( ) curr_filename, best_filename = filenames - if overwrite_ckpt_check: + if overwrite_ckpt_check and output_dir is not None: patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"] if output_rootname is not None: patterns = [ diff --git a/tests/test_integration.py b/tests/test_integration.py index 47cee936..ded8294d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,6 +3,7 @@ from pathlib import Path import pyteomics.mztab +import pytest from click.testing import CliRunner from casanovo import casanovo @@ -29,28 +30,40 @@ def test_train_and_run( str(mgf_small), "--config", tiny_config, - "--output", - str(tmp_path / "train"), + "--output_dir", + str(tmp_path), + "--output_root", + "train", str(mgf_small), # The training files. ] result = run(train_args) - model_file = tmp_path / "epoch=19-step=20.ckpt" - best_model = tmp_path / "best.ckpt" + model_file = tmp_path / "train.epoch=19-step=20.ckpt" + best_model = tmp_path / "train.best.ckpt" assert result.exit_code == 0 assert model_file.exists() assert best_model.exists() - # Finally try predicting: - output_filename = tmp_path / "test.mztab" + # Check that re-running train fails due to no overwrite + with pytest.raises(FileExistsError): + run(train_args) + + assert model_file.exists() + assert best_model.exists() + + # Try predicting: + output_rootname = "test" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") predict_args = [ "sequence", "--model", str(model_file), "--config", tiny_config, - "--output", - str(output_filename), + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, str(mgf_small), str(mzml_small), ] @@ -77,16 +90,25 @@ def test_train_and_run( assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" + # Verify that running predict again fails due to no overwrite + with pytest.raises(FileExistsError): + run(predict_args) + + assert output_filename.is_file() + # Finally, try evaluating: - output_filename = tmp_path / "test-eval.mztab" + output_rootname = "test-eval" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") eval_args = [ "sequence", "--model", str(model_file), "--config", tiny_config, - "--output", - str(output_filename), + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, str(mgf_small), str(mzml_small), "--evaluate", @@ -134,6 +156,12 @@ def test_train_and_run( ] ) + # Verify that running again fails due to no overwrite + with pytest.raises(FileExistsError): + run(eval_args) + + assert output_filename.is_file() + def test_auxilliary_cli(tmp_path, monkeypatch): """Test the secondary CLI commands""" diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index db21725a..742c35b4 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -161,16 +161,26 @@ def test_save_final_model(tmp_path, mgf_small, tiny_config): config = Config(tiny_config) config.val_check_interval = 50 model_file = tmp_path / "epoch=19-step=20.ckpt" - with ModelRunner(config) as runner: + with ModelRunner(config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) + assert model_file.exists() + + # Test that training again raises file exists error + with pytest.raises(FileExistsError): + with ModelRunner(config, output_dir=tmp_path) as runner: + runner.train([mgf_small], [mgf_small]) + assert model_file.exists() Path.unlink(model_file) # Test checkpoint saving when val_check_interval is not a factor of training steps config.val_check_interval = 15 validation_file = tmp_path / "foobar.best.ckpt" - with ModelRunner(config, output_rootname="foobar") as runner: + model_file = tmp_path / "foobar.epoch=19-step=20.ckpt" + with ModelRunner( + config, output_dir=tmp_path, output_rootname="foobar" + ) as runner: runner.train([mgf_small], [mgf_small]) assert model_file.exists() @@ -185,14 +195,16 @@ def test_evaluate( config = Config(tiny_config) config.max_epochs = 1 model_file = tmp_path / "epoch=0-step=1.ckpt" - with ModelRunner(config) as runner: + with ModelRunner(config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) assert model_file.is_file() # Test evaluation with annotated peak file result_file = tmp_path / "result.mztab" - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mgf_small], result_file, evaluate=True) assert result_file.is_file() @@ -205,15 +217,21 @@ def test_evaluate( ) with pytest.raises(FileNotFoundError): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mzml_small], result_file, evaluate=True) with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mgf_small_unannotated], result_file, evaluate=True) with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict( [mgf_small_unannotated, mzml_small], result_file, evaluate=True ) @@ -225,14 +243,18 @@ def test_evaluate( # Test mix of annotated an unannotated peak files with pytest.warns(RuntimeWarning): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict([mgf_small, mzml_small], result_file, evaluate=True) assert result_file.is_file() result_file.unlink() with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict( [mgf_small, mgf_small_unannotated], result_file, evaluate=True ) @@ -241,7 +263,9 @@ def test_evaluate( result_file.unlink() with pytest.raises(TypeError, match=exception_string): - with ModelRunner(config, model_filename=str(model_file)) as runner: + with ModelRunner( + config, model_filename=str(model_file), overwrite_ckpt_check=False + ) as runner: runner.predict( [mgf_small, mgf_small_unannotated, mzml_small], result_file, diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 120e341a..33f20bf5 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -920,3 +920,15 @@ def test_run_map(mgf_small): out_writer.set_ms_run([os.path.abspath(mgf_small.name)]) assert os.path.basename(mgf_small.name) not in out_writer._run_map assert os.path.abspath(mgf_small.name) in out_writer._run_map + + +def test_check_dir(tmp_path): + exists_path = tmp_path / "exists-1234.ckpt" + exists_pattern = r"exists\-\d+\.ckpt" + exists_path.touch() + dne_pattern = r"dne\-\d+\.ckpt" + + with pytest.raises(FileExistsError): + utils.check_dir(tmp_path, [exists_pattern, dne_pattern]) + + utils.check_dir(tmp_path, [dne_pattern]) From bf14f2ba079f2d4df968fa4dd999cd439c4fc772 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 16:02:31 -0700 Subject: [PATCH 09/24] changelog entry --- CHANGELOG.md | 3 +++ tests/conftest.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 12a92e38..020883a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag. +- The `--output` option has been split into two options, `--output_dir` and `--output_root`. +- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training. +- The `--validation_peak_path` is now optional during training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for evaluation. ### Fixed diff --git a/tests/conftest.py b/tests/conftest.py index bff6b011..68e4214a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -214,7 +214,6 @@ def tiny_config(tmp_path): "cosine_schedule_period_iters": 1, "max_epochs": 20, "val_check_interval": 1, - "model_save_folder_path": str(tmp_path), "accelerator": "cpu", "precursor_mass_tol": 5, "isotope_error_range": [0, 1], From 1903cbcb3395982913acda3b100ab67ee5f05fbc Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 26 Aug 2024 17:44:00 -0700 Subject: [PATCH 10/24] revised changelog --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 020883a3..af430ec0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,8 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag. - The `--output` option has been split into two options, `--output_dir` and `--output_root`. -- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training. -- The `--validation_peak_path` is now optional during training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for evaluation. +- The `--validation_peak_path` is now optional when training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for validation. ### Fixed @@ -24,7 +23,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Removed -- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `.best.ckpt`. +- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `.best.ckpt`. +- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training. ## [4.2.1] - 2024-06-25 From 90ef08b3126820114306ef8b4061b97a953392bb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 27 Aug 2024 00:52:57 +0000 Subject: [PATCH 11/24] Generate new screengrabs with rich-codex --- docs/images/sequence-help.svg | 206 ++++++++++++++++++--------------- docs/images/train-help.svg | 210 ++++++++++++++++++---------------- 2 files changed, 224 insertions(+), 192 deletions(-) diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index 70570e2a..a707a97c 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo sequence --help - -Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  - peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    - MGF file.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---evaluate-e  Run in evaluation mode. When     -                                              this flag is set the peptide     -                                              and amino acid precision will    -                                              be calculated and logged at the  -                                              end of the sequencing run. All   -                                              input files must be annotated    -                                              MGF files if running in          -                                              evaluation mode.                 ---model-mTEXT                        Either the model weights (.ckpt  -                                              file) or a URL pointing to the   -                                              model weights file. If not       -                                              provided, Casanovo will try to   -                                              download the latest release      -                                              automatically.                   ---output-oFILE                        The mzTab file to which results  -                                              will be written.                 ---config-cFILE                        The YAML configuration file      -                                              overriding the default options.  ---verbosity-v[debug|info|warning|error]  Set the verbosity of console     -                                              logging messages. Log files are  -                                              always set to 'debug'.           ---help-h  Show this message and exit.      -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  + peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    + MGF file.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--evaluate-e  Run in evaluation mode. When   +                                                this flag is set the peptide   +                                                and amino acid precision will  +                                                be calculated and logged at    +                                                the end of the sequencing      +                                                run. All input files must be   +                                                annotated MGF files if         +                                                running in evaluation mode.    +--model-mTEXT                        Either the model weights       +                                                (.ckpt file) or a URL          +                                                pointing to the model weights  +                                                file. If not provided,         +                                                Casanovo will try to download  +                                                the latest release             +                                                automatically.                 +--output_dir-fPATH                        The destination directory for  +                                                output files                   +--output_root-oFILE                        The root name for all output   +                                                files                          +--config-cFILE                        The YAML configuration file    +                                                overriding the default         +                                                options.                       +--verbosity-v[debug|info|warning|error]  Set the verbosity of console   +                                                logging messages. Log files    +                                                are always set to 'debug'.     +--overwrite-d  Whether to overwrite output    +                                                files.                         +--help-h  Show this message and exit.    +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index e27717e1..f5fd20a7 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + - + - + - - $ casanovo train --help - -Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  TRAIN_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -*--validation_peak_pa…-pFILE                    An annotated MGF file   -                                                       for validation, like    -                                                       from MassIVE-KB. Use    -                                                       this option multiple    -                                                       times to specify        -                                                       multiple files.         -[required]             ---model-mTEXT                    Either the model        -                                                       weights (.ckpt file)    -                                                       or a URL pointing to    -                                                       the model weights       -                                                       file. If not provided,  -                                                       Casanovo will try to    -                                                       download the latest     -                                                       release automatically.  ---output-oFILE                    The mzTab file to       -                                                       which results will be   -                                                       written.                ---config-cFILE                    The YAML configuration  -                                                       file overriding the     -                                                       default options.        ---verbosity-v[debug|info|warning|er  Set the verbosity of    -ror]  console logging         -                                                       messages. Log files     -                                                       are always set to       -                                                       'debug'.                ---help-h  Show this message and   -                                                       exit.                   -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--validation_peak_path-pFILE                    An annotated MGF file     +                                                     for validation, like      +                                                     from MassIVE-KB. Use      +                                                     this option multiple      +                                                     times to specify          +                                                     multiple files.           +--model-mTEXT                    Either the model weights  +                                                     (.ckpt file) or a URL     +                                                     pointing to the model     +                                                     weights file. If not      +                                                     provided, Casanovo will   +                                                     try to download the       +                                                     latest release            +                                                     automatically.            +--output_dir-fPATH                    The destination           +                                                     directory for output      +                                                     files                     +--output_root-oFILE                    The root name for all     +                                                     output files              +--config-cFILE                    The YAML configuration    +                                                     file overriding the       +                                                     default options.          +--verbosity-v[debug|info|warning|er  Set the verbosity of      +ror]  console logging           +                                                     messages. Log files are   +                                                     always set to 'debug'.    +--overwrite-d  Whether to overwrite      +                                                     output files.             +--help-h  Show this message and     +                                                     exit.                     +╰──────────────────────────────────────────────────────────────────────────────╯ + From 970adb68985b16777e52a5687ca7fd016dca608d Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 29 Aug 2024 13:19:39 -0700 Subject: [PATCH 12/24] requested changes --- casanovo/casanovo.py | 36 ++++++++------ casanovo/denovo/model_runner.py | 14 +++--- tests/test_integration.py | 86 ++++++++------------------------- 3 files changed, 47 insertions(+), 89 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index ca87729f..bdeb9766 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -67,7 +67,7 @@ def __init__(self, *args, **kwargs) -> None: """, ), click.Option( - ("-f", "--output_dir"), + ("-d", "--output_dir"), help="The destination directory for output files", type=click.Path(dir_okay=True), ), @@ -96,10 +96,8 @@ def __init__(self, *args, **kwargs) -> None: default="info", ), click.Option( - ("-d", "--overwrite"), - help=""" - Whether to overwrite output files. - """, + ("-f", "--overwrite"), + help="Whether to overwrite output files.", is_flag=True, show_default=True, default=False, @@ -170,14 +168,10 @@ def sequence( to sequence peptides. If evaluate is set to True PEAK_PATH must be one or more annotated MGF file. """ - output_dir = Path(output_dir) if output_dir is not None else Path.cwd() - output_base_name = None + output, output_dir = _resolve_output(output_dir, output_root, verbosity) if output_root is not None and not overwrite: - output_base_name = output_dir / output_root - base_pattern = re.escape(output_root) - patterns = [base_pattern + r"\.log", base_pattern + r"\.mztab"] - utils.check_dir(output_dir, patterns) - output = setup_logging(output_base_name, verbosity) + file_pattern = re.escape(output_root) + r"\.(?:log|mztab)" + utils.check_dir(output_dir, [file_pattern]) config, model = setup_model(model, config, output, False) start_time = time.time() @@ -229,12 +223,9 @@ def train( TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB, from which to train a new Casnovo model. """ - output_dir = Path(output_dir) if output_dir is not None else Path.cwd() - output_base_name = None + output, output_dir = _resolve_output(output_dir, output_root, verbosity) if output_root is not None and not overwrite: - output_base_name = output_dir / output_root utils.check_dir(output_dir, [re.escape(output_root) + r"\.log"]) - output = setup_logging(output_base_name, verbosity) config, model = setup_model(model, config, output, True) start_time = time.time() @@ -526,6 +517,19 @@ def _get_model_weights(cache_dir: Path) -> str: ) +def _resolve_output( + output_dir: str | None, + output_root: str | None, + verbosity: str, +) -> Tuple[Path, str]: + output_dir = Path(output_dir) if output_dir is not None else Path.cwd() + output_base_name = ( + None if output_root is None else (output_dir / output_root) + ) + output = setup_logging(output_base_name, verbosity) + return output, output_dir + + def _get_weights_from_url( file_url: str, cache_dir: Path, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 23f7f3ca..b3deb911 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -63,14 +63,14 @@ def __init__( self.loaders = None self.writer = None - filenames = ("{epoch}-{step}", "best") - if output_rootname is not None: - filenames = tuple( - f"{output_rootname}.{curr_name}" for curr_name in filenames - ) - curr_filename, best_filename = filenames + output_dir = Path.cwd() if output_dir is None else output_dir + prefix = f"{output_rootname}." if output_rootname is not None else "" + curr_filename, best_filename = ( + prefix + "{epoch}-{step}", + prefix + "best", + ) - if overwrite_ckpt_check and output_dir is not None: + if overwrite_ckpt_check: patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"] if output_rootname is not None: patterns = [ diff --git a/tests/test_integration.py b/tests/test_integration.py index ded8294d..3c6718f5 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,7 +3,6 @@ from pathlib import Path import pyteomics.mztab -import pytest from click.testing import CliRunner from casanovo import casanovo @@ -30,40 +29,43 @@ def test_train_and_run( str(mgf_small), "--config", tiny_config, - "--output_dir", - str(tmp_path), - "--output_root", - "train", + "--output", + str(tmp_path / "train"), str(mgf_small), # The training files. ] result = run(train_args) - model_file = tmp_path / "train.epoch=19-step=20.ckpt" - best_model = tmp_path / "train.best.ckpt" + model_file = tmp_path / "epoch=19-step=20.ckpt" + best_model = tmp_path / "best.ckpt" assert result.exit_code == 0 assert model_file.exists() assert best_model.exists() - # Check that re-running train fails due to no overwrite - with pytest.raises(FileExistsError): - run(train_args) + # Try evaluating: + eval_args = [ + "evaluate", + "--model", + str(model_file), + "--config", + str(tiny_config), + "--output", + str(tmp_path / "eval"), + str(mgf_small), + ] - assert model_file.exists() - assert best_model.exists() + result = run(eval_args) + assert result.exit_code == 0 # Try predicting: - output_rootname = "test" - output_filename = (tmp_path / output_rootname).with_suffix(".mztab") + output_filename = tmp_path / "test.mztab" predict_args = [ "sequence", "--model", str(model_file), "--config", tiny_config, - "--output_dir", - str(tmp_path), - "--output_root", - output_rootname, + "--output", + str(output_filename), str(mgf_small), str(mzml_small), ] @@ -90,48 +92,6 @@ def test_train_and_run( assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" - # Verify that running predict again fails due to no overwrite - with pytest.raises(FileExistsError): - run(predict_args) - - assert output_filename.is_file() - - # Finally, try evaluating: - output_rootname = "test-eval" - output_filename = (tmp_path / output_rootname).with_suffix(".mztab") - eval_args = [ - "sequence", - "--model", - str(model_file), - "--config", - tiny_config, - "--output_dir", - str(tmp_path), - "--output_root", - output_rootname, - str(mgf_small), - str(mzml_small), - "--evaluate", - ] - - result = run(eval_args) - assert result.exit_code == 0 - assert output_filename.is_file() - - mztab = pyteomics.mztab.MzTab(str(output_filename)) - filename = "small.mgf" - # Verify that the input annotated peak file is listed in the metadata. - assert f"ms_run[1]-location" in mztab.metadata - assert mztab.metadata[f"ms_run[1]-location"].endswith(filename) - - # Verify that the spectrum predictions are correct - # and indexed according to the peak input file type. - psms = mztab.spectrum_match_table - assert psms.loc[1, "sequence"] == "LESLLEK" - assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" - assert psms.loc[2, "sequence"] == "PEPTLDEK" - assert psms.loc[2, "spectra_ref"] == "ms_run[1]:index=1" - # Validate mztab output validate_args = [ "java", @@ -156,12 +116,6 @@ def test_train_and_run( ] ) - # Verify that running again fails due to no overwrite - with pytest.raises(FileExistsError): - run(eval_args) - - assert output_filename.is_file() - def test_auxilliary_cli(tmp_path, monkeypatch): """Test the secondary CLI commands""" From e68858b17d7040cf9054b1a3640e986d4e93ea69 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 30 Aug 2024 13:29:08 -0700 Subject: [PATCH 13/24] updated integration test --- tests/test_integration.py | 86 +++++++++++++++++++++++++-------- tests/unit_tests/test_runner.py | 14 +++--- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 3c6718f5..ded8294d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,6 +3,7 @@ from pathlib import Path import pyteomics.mztab +import pytest from click.testing import CliRunner from casanovo import casanovo @@ -29,43 +30,40 @@ def test_train_and_run( str(mgf_small), "--config", tiny_config, - "--output", - str(tmp_path / "train"), + "--output_dir", + str(tmp_path), + "--output_root", + "train", str(mgf_small), # The training files. ] result = run(train_args) - model_file = tmp_path / "epoch=19-step=20.ckpt" - best_model = tmp_path / "best.ckpt" + model_file = tmp_path / "train.epoch=19-step=20.ckpt" + best_model = tmp_path / "train.best.ckpt" assert result.exit_code == 0 assert model_file.exists() assert best_model.exists() - # Try evaluating: - eval_args = [ - "evaluate", - "--model", - str(model_file), - "--config", - str(tiny_config), - "--output", - str(tmp_path / "eval"), - str(mgf_small), - ] + # Check that re-running train fails due to no overwrite + with pytest.raises(FileExistsError): + run(train_args) - result = run(eval_args) - assert result.exit_code == 0 + assert model_file.exists() + assert best_model.exists() # Try predicting: - output_filename = tmp_path / "test.mztab" + output_rootname = "test" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") predict_args = [ "sequence", "--model", str(model_file), "--config", tiny_config, - "--output", - str(output_filename), + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, str(mgf_small), str(mzml_small), ] @@ -92,6 +90,48 @@ def test_train_and_run( assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" + # Verify that running predict again fails due to no overwrite + with pytest.raises(FileExistsError): + run(predict_args) + + assert output_filename.is_file() + + # Finally, try evaluating: + output_rootname = "test-eval" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") + eval_args = [ + "sequence", + "--model", + str(model_file), + "--config", + tiny_config, + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, + str(mgf_small), + str(mzml_small), + "--evaluate", + ] + + result = run(eval_args) + assert result.exit_code == 0 + assert output_filename.is_file() + + mztab = pyteomics.mztab.MzTab(str(output_filename)) + filename = "small.mgf" + # Verify that the input annotated peak file is listed in the metadata. + assert f"ms_run[1]-location" in mztab.metadata + assert mztab.metadata[f"ms_run[1]-location"].endswith(filename) + + # Verify that the spectrum predictions are correct + # and indexed according to the peak input file type. + psms = mztab.spectrum_match_table + assert psms.loc[1, "sequence"] == "LESLLEK" + assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" + assert psms.loc[2, "sequence"] == "PEPTLDEK" + assert psms.loc[2, "spectra_ref"] == "ms_run[1]:index=1" + # Validate mztab output validate_args = [ "java", @@ -116,6 +156,12 @@ def test_train_and_run( ] ) + # Verify that running again fails due to no overwrite + with pytest.raises(FileExistsError): + run(eval_args) + + assert output_filename.is_file() + def test_auxilliary_cli(tmp_path, monkeypatch): """Test the secondary CLI commands""" diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 742c35b4..f31725ff 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -29,7 +29,7 @@ def test_initialize_model(tmp_path, mgf_small): config.max_epochs = 1 config.n_layers = 1 ckpt = tmp_path / "existing.ckpt" - with ModelRunner(config=config) as runner: + with ModelRunner(config=config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) runner.trainer.save_checkpoint(ckpt) @@ -58,7 +58,7 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): ckpt = tmp_path / "test.ckpt" mztab = tmp_path / "test.mztab" - with ModelRunner(config=config) as runner: + with ModelRunner(config=config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) runner.trainer.save_checkpoint(ckpt) @@ -110,7 +110,7 @@ def test_save_and_load_weights_deprecated(tmp_path, mgf_small, tiny_config): config.cosine_schedule_period_iters = 5 ckpt = tmp_path / "test.ckpt" - with ModelRunner(config=config) as runner: + with ModelRunner(config=config, output_dir=tmp_path) as runner: runner.train([mgf_small], [mgf_small]) runner.trainer.save_checkpoint(ckpt) @@ -125,7 +125,9 @@ def test_save_and_load_weights_deprecated(tmp_path, mgf_small, tiny_config): runner.initialize_model(train=False) assert runner.model.cosine_schedule_period_iters == 5 # Fine-tuning. - with ModelRunner(config=config, model_filename=str(ckpt)) as runner: + with ModelRunner( + config=config, model_filename=str(ckpt), output_dir=tmp_path + ) as runner: with pytest.warns(DeprecationWarning): runner.train([mgf_small], [mgf_small]) assert "max_iters" not in runner.model.opt_kwargs @@ -139,7 +141,7 @@ def test_calculate_precision(tmp_path, mgf_small, tiny_config): config.calculate_precision = False config.tb_summarywriter = str(tmp_path) - runner = ModelRunner(config=config) + runner = ModelRunner(config=config, output_dir=tmp_path) with runner: runner.train([mgf_small], [mgf_small]) @@ -147,7 +149,7 @@ def test_calculate_precision(tmp_path, mgf_small, tiny_config): assert "valid_pep_precision" not in runner.model.history.columns config.calculate_precision = True - runner = ModelRunner(config=config) + runner = ModelRunner(config=config, output_dir=tmp_path) with runner: runner.train([mgf_small], [mgf_small]) From 3d91f813247c3d14379ced1532a4a655d080a128 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 30 Aug 2024 16:49:13 -0700 Subject: [PATCH 14/24] requested changes --- casanovo/casanovo.py | 52 ++++++++++++++++++++++++++------- casanovo/denovo/model_runner.py | 14 ++++----- casanovo/utils.py | 24 ++++++--------- tests/test_integration.py | 15 ---------- tests/unit_tests/test_runner.py | 13 +++++++-- tests/unit_tests/test_unit.py | 4 +-- 6 files changed, 69 insertions(+), 53 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index bdeb9766..f9c37f08 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -96,7 +96,7 @@ def __init__(self, *args, **kwargs) -> None: default="info", ), click.Option( - ("-f", "--overwrite"), + ("-f", "--force_overwrite"), help="Whether to overwrite output files.", is_flag=True, show_default=True, @@ -159,7 +159,7 @@ def sequence( output_dir: Optional[str], output_root: Optional[str], verbosity: str, - overwrite: bool, + force_overwrite: bool, evaluate: bool, ) -> None: """De novo sequence peptides from tandem mass spectra. @@ -168,11 +168,13 @@ def sequence( to sequence peptides. If evaluate is set to True PEAK_PATH must be one or more annotated MGF file. """ - output, output_dir = _resolve_output(output_dir, output_root, verbosity) - if output_root is not None and not overwrite: - file_pattern = re.escape(output_root) + r"\.(?:log|mztab)" - utils.check_dir(output_dir, [file_pattern]) + file_patterns = list() + if output_root is not None and not force_overwrite: + file_patterns = [f"{output_root}.log", f"{output_root}.mztab"] + output, output_dir = _resolve_output( + output_dir, output_root, file_patterns, verbosity + ) config, model = setup_model(model, config, output, False) start_time = time.time() with ModelRunner(config, model, output_root, output_dir, False) as runner: @@ -216,21 +218,24 @@ def train( output_dir: Optional[str], output_root: Optional[str], verbosity: str, - overwrite: bool, + force_overwrite: bool, ) -> None: """Train a Casanovo model on your own data. TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB, from which to train a new Casnovo model. """ - output, output_dir = _resolve_output(output_dir, output_root, verbosity) - if output_root is not None and not overwrite: - utils.check_dir(output_dir, [re.escape(output_root) + r"\.log"]) + file_patterns = list() + if output_root is not None and not force_overwrite: + file_patterns = [f"{output_root}.log"] + output, output_dir = _resolve_output( + output_dir, output_root, file_patterns, verbosity + ) config, model = setup_model(model, config, output, True) start_time = time.time() with ModelRunner( - config, model, output_root, output_dir, not overwrite + config, model, output_root, output_dir, not force_overwrite ) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: @@ -520,12 +525,37 @@ def _get_model_weights(cache_dir: Path) -> str: def _resolve_output( output_dir: str | None, output_root: str | None, + file_patterns: list[str], verbosity: str, ) -> Tuple[Path, str]: + """ + Resolves the output directory and sets up logging. + + Parameters: + ----------- + output_dir : str | None + The path to the output directory. If `None`, the current working + directory will be used. + output_root : str | None + The base name for the output files. If `None`, no specific base name is + set, and logging will be configured accordingly to the behavior of + `setup_logging`. + file_patterns : list[str] + A list of file patterns that should be checked within the `output_dir`. + verbosity : str + The verbosity level for logging. + + Returns: + -------- + Tuple[Path, str] + The output directory and the base name for log and results files (if + applicable). + """ output_dir = Path(output_dir) if output_dir is not None else Path.cwd() output_base_name = ( None if output_root is None else (output_dir / output_root) ) + utils.check_dir(output_dir, file_patterns) output = setup_logging(output_base_name, verbosity) return output, output_dir diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index ff441179..5c8833de 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -71,13 +71,13 @@ def __init__( ) if overwrite_ckpt_check: - patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"] - if output_rootname is not None: - patterns = [ - re.escape(output_rootname + ".") + pattern - for pattern in patterns - ] - utils.check_dir(output_dir, patterns) + utils.check_dir( + output_dir, + [ + f"{curr_filename.format(epoch='*', step='*')}.ckpt", + f"{best_filename}.ckpt", + ], + ) # Configure checkpoints. self.callbacks = [ diff --git a/casanovo/utils.py b/casanovo/utils.py index 3d778791..fb0c3327 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -256,9 +256,7 @@ def log_sequencing_report( ) -def check_dir( - dir: pathlib.Path, file_patterns: Iterable[re.Pattern[str]] -) -> None: +def check_dir(dir: pathlib.Path, file_patterns: Iterable[str]) -> None: """ Check that no file names in dir match any of file_patterns @@ -266,8 +264,8 @@ def check_dir( ---------- dir : pathlib.Path The directory to check for matching file names - file_patterns : Iterable[re.Pattern[str]] - File name re patterns to test file names against + file_patterns : Iterable[str] + UNIX style wildcard pattern to test file names against Raises ------ @@ -275,13 +273,9 @@ def check_dir( If matching file name is found in dir """ for pattern in file_patterns: - comp_pattern = re.compile(pattern) - for file in dir.iterdir(): - if not file.is_file(): - continue - - if comp_pattern.fullmatch(file.name) is not None: - raise FileExistsError( - f"File {file.name} already exists in {dir} " - "and can not be overwritten." - ) + matches = list(dir.glob(pattern)) + if len(matches) > 0: + raise FileExistsError( + f"File {matches[0].name} already exists in {dir} " + "and can not be overwritten." + ) diff --git a/tests/test_integration.py b/tests/test_integration.py index ded8294d..bb6ef66e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,7 +3,6 @@ from pathlib import Path import pyteomics.mztab -import pytest from click.testing import CliRunner from casanovo import casanovo @@ -44,10 +43,6 @@ def test_train_and_run( assert model_file.exists() assert best_model.exists() - # Check that re-running train fails due to no overwrite - with pytest.raises(FileExistsError): - run(train_args) - assert model_file.exists() assert best_model.exists() @@ -90,12 +85,6 @@ def test_train_and_run( assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" - # Verify that running predict again fails due to no overwrite - with pytest.raises(FileExistsError): - run(predict_args) - - assert output_filename.is_file() - # Finally, try evaluating: output_rootname = "test-eval" output_filename = (tmp_path / output_rootname).with_suffix(".mztab") @@ -156,10 +145,6 @@ def test_train_and_run( ] ) - # Verify that running again fails due to no overwrite - with pytest.raises(FileExistsError): - run(eval_args) - assert output_filename.is_file() diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index f31725ff..8cea9021 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -121,12 +121,17 @@ def test_save_and_load_weights_deprecated(tmp_path, mgf_small, tiny_config): torch.save(ckpt_data, str(ckpt)) # Inference. - with ModelRunner(config=config, model_filename=str(ckpt)) as runner: + with ModelRunner( + config=config, model_filename=str(ckpt), overwrite_ckpt_check=False + ) as runner: runner.initialize_model(train=False) assert runner.model.cosine_schedule_period_iters == 5 # Fine-tuning. with ModelRunner( - config=config, model_filename=str(ckpt), output_dir=tmp_path + config=config, + model_filename=str(ckpt), + output_dir=tmp_path, + overwrite_ckpt_check=False, ) as runner: with pytest.warns(DeprecationWarning): runner.train([mgf_small], [mgf_small]) @@ -149,7 +154,9 @@ def test_calculate_precision(tmp_path, mgf_small, tiny_config): assert "valid_pep_precision" not in runner.model.history.columns config.calculate_precision = True - runner = ModelRunner(config=config, output_dir=tmp_path) + runner = ModelRunner( + config=config, output_dir=tmp_path, overwrite_ckpt_check=False + ) with runner: runner.train([mgf_small], [mgf_small]) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 33f20bf5..8cdd291d 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -924,9 +924,9 @@ def test_run_map(mgf_small): def test_check_dir(tmp_path): exists_path = tmp_path / "exists-1234.ckpt" - exists_pattern = r"exists\-\d+\.ckpt" + exists_pattern = "exists-*.ckpt" exists_path.touch() - dne_pattern = r"dne\-\d+\.ckpt" + dne_pattern = "dne-*.ckpt" with pytest.raises(FileExistsError): utils.check_dir(tmp_path, [exists_pattern, dne_pattern]) From 645a33fc020a5db94ce2de3d483eb5fcdc7c2a92 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 30 Aug 2024 23:51:54 +0000 Subject: [PATCH 15/24] Generate new screengrabs with rich-codex --- docs/images/sequence-help.svg | 222 ++++++++++++++++++---------------- docs/images/train-help.svg | 204 +++++++++++++++---------------- 2 files changed, 221 insertions(+), 205 deletions(-) diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index a707a97c..ea6ff078 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + - + - + - - $ casanovo sequence --help - -Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  - peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    - MGF file.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---evaluate-e  Run in evaluation mode. When   -                                                this flag is set the peptide   -                                                and amino acid precision will  -                                                be calculated and logged at    -                                                the end of the sequencing      -                                                run. All input files must be   -                                                annotated MGF files if         -                                                running in evaluation mode.    ---model-mTEXT                        Either the model weights       -                                                (.ckpt file) or a URL          -                                                pointing to the model weights  -                                                file. If not provided,         -                                                Casanovo will try to download  -                                                the latest release             -                                                automatically.                 ---output_dir-fPATH                        The destination directory for  -                                                output files                   ---output_root-oFILE                        The root name for all output   -                                                files                          ---config-cFILE                        The YAML configuration file    -                                                overriding the default         -                                                options.                       ---verbosity-v[debug|info|warning|error]  Set the verbosity of console   -                                                logging messages. Log files    -                                                are always set to 'debug'.     ---overwrite-d  Whether to overwrite output    -                                                files.                         ---help-h  Show this message and exit.    -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  + peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    + MGF file.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--evaluate-e  Run in evaluation mode.     +                                                   When this flag is set the   +                                                   peptide and amino acid      +                                                   precision will be           +                                                   calculated and logged at    +                                                   the end of the sequencing   +                                                   run. All input files must   +                                                   be annotated MGF files if   +                                                   running in evaluation       +                                                   mode.                       +--model-mTEXT                       Either the model weights    +                                                   (.ckpt file) or a URL       +                                                   pointing to the model       +                                                   weights file. If not        +                                                   provided, Casanovo will     +                                                   try to download the latest  +                                                   release automatically.      +--output_dir-dPATH                       The destination directory   +                                                   for output files            +--output_root-oFILE                       The root name for all       +                                                   output files                +--config-cFILE                       The YAML configuration      +                                                   file overriding the         +                                                   default options.            +--verbosity-v[debug|info|warning|error  Set the verbosity of        +]  console logging messages.   +                                                   Log files are always set    +                                                   to 'debug'.                 +--force_overwrite-f  Whether to overwrite        +                                                   output files.               +--help-h  Show this message and       +                                                   exit.                       +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index f5fd20a7..783a0660 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -19,162 +19,162 @@ font-weight: 700; } - .terminal-249329225-matrix { + .terminal-2920970231-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-249329225-title { + .terminal-2920970231-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-249329225-r1 { fill: #c5c8c6 } -.terminal-249329225-r2 { fill: #d0b344 } -.terminal-249329225-r3 { fill: #c5c8c6;font-weight: bold } -.terminal-249329225-r4 { fill: #68a0b3;font-weight: bold } -.terminal-249329225-r5 { fill: #868887 } -.terminal-249329225-r6 { fill: #cc555a } -.terminal-249329225-r7 { fill: #d0b344;font-weight: bold } -.terminal-249329225-r8 { fill: #8a4346 } -.terminal-249329225-r9 { fill: #98a84b;font-weight: bold } -.terminal-249329225-r10 { fill: #8d7b39;font-weight: bold } + .terminal-2920970231-r1 { fill: #c5c8c6 } +.terminal-2920970231-r2 { fill: #d0b344 } +.terminal-2920970231-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-2920970231-r4 { fill: #68a0b3;font-weight: bold } +.terminal-2920970231-r5 { fill: #868887 } +.terminal-2920970231-r6 { fill: #cc555a } +.terminal-2920970231-r7 { fill: #d0b344;font-weight: bold } +.terminal-2920970231-r8 { fill: #8a4346 } +.terminal-2920970231-r9 { fill: #98a84b;font-weight: bold } +.terminal-2920970231-r10 { fill: #8d7b39;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -186,53 +186,53 @@ - + - - $ casanovo train --help - -Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  TRAIN_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---validation_peak_path-pFILE                    An annotated MGF file     -                                                     for validation, like      -                                                     from MassIVE-KB. Use      -                                                     this option multiple      -                                                     times to specify          -                                                     multiple files.           ---model-mTEXT                    Either the model weights  -                                                     (.ckpt file) or a URL     -                                                     pointing to the model     -                                                     weights file. If not      -                                                     provided, Casanovo will   -                                                     try to download the       -                                                     latest release            -                                                     automatically.            ---output_dir-fPATH                    The destination           -                                                     directory for output      -                                                     files                     ---output_root-oFILE                    The root name for all     -                                                     output files              ---config-cFILE                    The YAML configuration    -                                                     file overriding the       -                                                     default options.          ---verbosity-v[debug|info|warning|er  Set the verbosity of      -ror]  console logging           -                                                     messages. Log files are   -                                                     always set to 'debug'.    ---overwrite-d  Whether to overwrite      -                                                     output files.             ---help-h  Show this message and     -                                                     exit.                     -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--validation_peak_path-pFILE                    An annotated MGF file     +                                                     for validation, like      +                                                     from MassIVE-KB. Use      +                                                     this option multiple      +                                                     times to specify          +                                                     multiple files.           +--model-mTEXT                    Either the model weights  +                                                     (.ckpt file) or a URL     +                                                     pointing to the model     +                                                     weights file. If not      +                                                     provided, Casanovo will   +                                                     try to download the       +                                                     latest release            +                                                     automatically.            +--output_dir-dPATH                    The destination           +                                                     directory for output      +                                                     files                     +--output_root-oFILE                    The root name for all     +                                                     output files              +--config-cFILE                    The YAML configuration    +                                                     file overriding the       +                                                     default options.          +--verbosity-v[debug|info|warning|er  Set the verbosity of      +ror]  console logging           +                                                     messages. Log files are   +                                                     always set to 'debug'.    +--force_overwrite-f  Whether to overwrite      +                                                     output files.             +--help-h  Show this message and     +                                                     exit.                     +╰──────────────────────────────────────────────────────────────────────────────╯ + From 2d6dd00fbdb80a15cadc887bb8043bd1c8e069ed Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 3 Sep 2024 12:18:26 -0700 Subject: [PATCH 16/24] requested changes, output setup refactor --- casanovo/casanovo.py | 111 ++++++++++++++++---------------- casanovo/denovo/model_runner.py | 43 ++++++++----- casanovo/utils.py | 18 ++++-- tests/unit_tests/test_unit.py | 32 ++++++--- 4 files changed, 114 insertions(+), 90 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index f9c37f08..d146d5f5 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -12,7 +12,7 @@ import urllib.parse import warnings from pathlib import Path -from typing import Optional, Tuple +from typing import Optional, Tuple, List warnings.formatwarning = lambda message, category, *args, **kwargs: ( f"{category.__name__}: {message}" @@ -168,16 +168,13 @@ def sequence( to sequence peptides. If evaluate is set to True PEAK_PATH must be one or more annotated MGF file. """ - file_patterns = list() - if output_root is not None and not force_overwrite: - file_patterns = [f"{output_root}.log", f"{output_root}.mztab"] - - output, output_dir = _resolve_output( - output_dir, output_root, file_patterns, verbosity + output_path, output_root = _setup_output( + output_dir, output_root, force_overwrite, verbosity ) - config, model = setup_model(model, config, output, False) + utils.check_dir_file_exists(output_path, f"{output_root}.mztab") + config, model = setup_model(model, config, output_dir, output_root, False) start_time = time.time() - with ModelRunner(config, model, output_root, output_dir, False) as runner: + with ModelRunner(config, model, output_path, output_root, False) as runner: logger.info( "Sequencing %speptides from:", "and evaluating " if evaluate else "", @@ -185,7 +182,11 @@ def sequence( for peak_file in peak_path: logger.info(" %s", peak_file) - runner.predict(peak_path, output, evaluate=evaluate) + runner.predict( + peak_path, + str((output_path / output_root).with_suffix(".mztab")), + evaluate=evaluate, + ) psms = runner.writer.psms utils.log_sequencing_report( psms, start_time=start_time, end_time=time.time() @@ -225,17 +226,13 @@ def train( TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB, from which to train a new Casnovo model. """ - file_patterns = list() - if output_root is not None and not force_overwrite: - file_patterns = [f"{output_root}.log"] - - output, output_dir = _resolve_output( - output_dir, output_root, file_patterns, verbosity + output_path, output_root = _setup_output( + output_dir, output_root, force_overwrite, verbosity ) - config, model = setup_model(model, config, output, True) + config, model = setup_model(model, config, output_path, output_root, True) start_time = time.time() with ModelRunner( - config, model, output_root, output_dir, not force_overwrite + config, model, output_path, output_root, not force_overwrite ) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: @@ -283,7 +280,7 @@ def configure(output: str) -> None: def setup_logging( - output: Optional[str], + log_file_path: Path, verbosity: str, ) -> Path: """Set up the logger. @@ -292,21 +289,11 @@ def setup_logging( Parameters ---------- - output : Optional[str] - The provided output file name. + log_file_path: Path + The log file path. verbosity : str The logging level to use in the console. - - Return - ------ - output : Path - The output file path. """ - if output is None: - output = f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}" - - output = Path(output).expanduser().resolve() - logging_levels = { "debug": logging.DEBUG, "info": logging.INFO, @@ -333,9 +320,7 @@ def setup_logging( console_handler.setFormatter(console_formatter) root_logger.addHandler(console_handler) warnings_logger.addHandler(console_handler) - file_handler = logging.FileHandler( - output.with_suffix(".log"), encoding="utf8" - ) + file_handler = logging.FileHandler(log_file_path, encoding="utf8") file_handler.setFormatter(log_formatter) root_logger.addHandler(file_handler) warnings_logger.addHandler(file_handler) @@ -352,13 +337,12 @@ def setup_logging( logging.getLogger("torch").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) - return output - def setup_model( model: Optional[str], config: Optional[str], - output: Optional[Path], + output_dir: Optional[Path | str], + output_root_name: Optional[str], is_train: bool, ) -> Config: """Setup Casanovo for most commands. @@ -418,7 +402,8 @@ def setup_model( logger.info("Casanovo version %s", str(__version__)) logger.debug("model = %s", model) logger.debug("config = %s", config.file) - logger.debug("output = %s", output) + logger.debug("output directory = %s", output_dir) + logger.debug("output root name = %s", output_root_name) for key, value in config.items(): logger.debug("%s = %s", str(key), str(value)) @@ -522,42 +507,54 @@ def _get_model_weights(cache_dir: Path) -> str: ) -def _resolve_output( +def _setup_output( output_dir: str | None, output_root: str | None, - file_patterns: list[str], + overwrite: bool, verbosity: str, ) -> Tuple[Path, str]: """ - Resolves the output directory and sets up logging. + Set up the output directory, output file root name, and logging. Parameters: ----------- output_dir : str | None - The path to the output directory. If `None`, the current working - directory will be used. + The path to the output directory. If `None`, the output directory will + be resolved to the current working directory. output_root : str | None - The base name for the output files. If `None`, no specific base name is - set, and logging will be configured accordingly to the behavior of - `setup_logging`. - file_patterns : list[str] - A list of file patterns that should be checked within the `output_dir`. + The base name for the output files. If `None` the output root name will + be resolved to casanovo_ + overwrite: bool + Whether to overwrite log file if it already exists in the output + directory. verbosity : str The verbosity level for logging. Returns: -------- Tuple[Path, str] - The output directory and the base name for log and results files (if - applicable). + A tuple containing the resolved output directory and root name for + output files. """ - output_dir = Path(output_dir) if output_dir is not None else Path.cwd() - output_base_name = ( - None if output_root is None else (output_dir / output_root) - ) - utils.check_dir(output_dir, file_patterns) - output = setup_logging(output_base_name, verbosity) - return output, output_dir + if output_root is None: + output_root = ( + f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}" + ) + + if output_dir is None: + output_path = Path.cwd() + else: + output_path = Path(output_dir) + if not output_path.is_dir(): + raise FileNotFoundError( + f"Target output directory {output_dir} does not exists." + ) + + if not overwrite: + utils.check_dir_file_exists(output_path, f"{output_root}.log") + + setup_logging((output_path / output_root).with_suffix(".log"), verbosity) + return output_path, output_root def _get_weights_from_url( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 5c8833de..a7302565 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -40,17 +40,21 @@ class ModelRunner: model_filename : str, optional The model filename is required for eval and de novo modes, but not for training a model from scratch. - output_rootname : str, optional - The rootname for all output files (e.g. checkpoints or results) + output_dir : Path | None, optional + The directory where checkpoint files will be saved. If `None` no + checkpoint files will be saved and a warning will be triggered. + output_rootname : str | None, optional + The root name for checkpoint files (e.g., checkpoints or results). If + `None` no base name will be used for checkpoint files. """ def __init__( self, config: Config, model_filename: Optional[str] = None, - output_rootname: Optional[str] = None, - output_dir: Optional[str] = None, - overwrite_ckpt_check: bool = True, + output_dir: Optional[Path | None] = None, + output_rootname: Optional[str | None] = None, + overwrite_ckpt_check: Optional[bool] = True, ) -> None: """Initialize a ModelRunner""" self.config = config @@ -63,15 +67,19 @@ def __init__( self.loaders = None self.writer = None - output_dir = Path.cwd() if output_dir is None else output_dir - prefix = f"{output_rootname}." if output_rootname is not None else "" - curr_filename, best_filename = ( - prefix + "{epoch}-{step}", - prefix + "best", - ) + if output_dir is None: + self.callbacks = [] + warnings.warn( + "Checkpoint directory not set in ModelRunner, " + "no checkpoint files will be saved." + ) + return + prefix = f"{output_rootname}." if output_rootname is not None else "" + curr_filename = prefix + "{epoch}-{step}" + best_filename = prefix + "best" if overwrite_ckpt_check: - utils.check_dir( + utils.check_dir_file_exists( output_dir, [ f"{curr_filename.format(epoch='*', step='*')}.ckpt", @@ -167,7 +175,10 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) def predict( - self, peak_path: Iterable[str], output: str, evaluate: bool = False + self, + peak_path: Iterable[str], + results_path: str, + evaluate: bool = False, ) -> None: """Predict peptide sequences with a trained Casanovo model. @@ -178,8 +189,8 @@ def predict( ---------- peak_path : iterable of str The path with the MS data files for predicting peptide sequences. - output : str - Where should the output be saved? + results_path : str + Sequencing results file path evaluate: bool whether to run model evaluation in addition to inference Note: peak_path most point to annotated MS data files when @@ -190,7 +201,7 @@ def predict( ------- self """ - self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) + self.writer = ms_io.MztabWriter(results_path) self.writer.set_metadata( self.config, model=str(self.model_filename), diff --git a/casanovo/utils.py b/casanovo/utils.py index fb0c3327..1161b5eb 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -256,7 +256,9 @@ def log_sequencing_report( ) -def check_dir(dir: pathlib.Path, file_patterns: Iterable[str]) -> None: +def check_dir_file_exists( + dir: pathlib.Path, file_patterns: Iterable[str] | str +) -> None: """ Check that no file names in dir match any of file_patterns @@ -264,18 +266,20 @@ def check_dir(dir: pathlib.Path, file_patterns: Iterable[str]) -> None: ---------- dir : pathlib.Path The directory to check for matching file names - file_patterns : Iterable[str] - UNIX style wildcard pattern to test file names against + file_patterns : Iterable[str] | str + UNIX style wildcard pattern(s) to test file names against Raises ------ FileExistsError If matching file name is found in dir """ + if isinstance(file_patterns, str): + file_patterns = [file_patterns] + for pattern in file_patterns: - matches = list(dir.glob(pattern)) - if len(matches) > 0: + if next(dir.glob(pattern), None) is not None: raise FileExistsError( - f"File {matches[0].name} already exists in {dir} " - "and can not be overwritten." + f"File matching wildcard pattern {pattern} already exist in" + f"{dir} and can not be overwritten." ) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 8cdd291d..f221d502 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -169,14 +169,14 @@ def test_setup_model(monkeypatch): filename = pathlib.Path(tmp_dir) / "casanovo_massivekb_v3_0_0.ckpt" assert not filename.is_file() - _, result_path = casanovo.setup_model(None, None, None, False) + _, result_path = casanovo.setup_model(None, None, None, None, False) assert result_path.resolve() == filename.resolve() assert filename.is_file() assert mock_get.request_counter == 1 os.remove(result_path) assert not filename.is_file() - _, result = casanovo.setup_model(None, None, None, True) + _, result = casanovo.setup_model(None, None, None, None, True) assert result is None assert not filename.is_file() assert mock_get.request_counter == 1 @@ -195,14 +195,18 @@ def test_setup_model(monkeypatch): cache_file_path = cache_file_dir / cache_file_name assert not cache_file_path.is_file() - _, result_path = casanovo.setup_model(file_url, None, None, False) + _, result_path = casanovo.setup_model( + file_url, None, None, None, False + ) assert cache_file_path.is_file() assert result_path.resolve() == cache_file_path.resolve() assert mock_get.request_counter == 2 os.remove(result_path) assert not cache_file_path.is_file() - _, result_path = casanovo.setup_model(file_url, None, None, False) + _, result_path = casanovo.setup_model( + file_url, None, None, None, False + ) assert cache_file_path.is_file() assert result_path.resolve() == cache_file_path.resolve() assert mock_get.request_counter == 3 @@ -217,11 +221,15 @@ def test_setup_model(monkeypatch): mnk.setattr(requests, "get", mock_get) temp_file_path = temp_file.name - _, result = casanovo.setup_model(temp_file_path, None, None, False) + _, result = casanovo.setup_model( + temp_file_path, None, None, None, False + ) assert mock_get.request_counter == 3 assert result == temp_file_path - _, result = casanovo.setup_model(temp_file_path, None, None, True) + _, result = casanovo.setup_model( + temp_file_path, None, None, None, True + ) assert mock_get.request_counter == 3 assert result == temp_file_path @@ -233,12 +241,12 @@ def test_setup_model(monkeypatch): mnk.setattr(requests, "get", mock_get) with pytest.raises(ValueError): - casanovo.setup_model("FooBar", None, None, False) + casanovo.setup_model("FooBar", None, None, None, False) assert mock_get.request_counter == 3 with pytest.raises(ValueError): - casanovo.setup_model("FooBar", None, None, False) + casanovo.setup_model("FooBar", None, None, None, False) assert mock_get.request_counter == 3 @@ -929,6 +937,10 @@ def test_check_dir(tmp_path): dne_pattern = "dne-*.ckpt" with pytest.raises(FileExistsError): - utils.check_dir(tmp_path, [exists_pattern, dne_pattern]) + utils.check_dir_file_exists(tmp_path, [exists_pattern, dne_pattern]) + + with pytest.raises(FileExistsError): + utils.check_dir_file_exists(tmp_path, exists_pattern) - utils.check_dir(tmp_path, [dne_pattern]) + utils.check_dir_file_exists(tmp_path, [dne_pattern]) + utils.check_dir_file_exists(tmp_path, dne_pattern) From 1ee28be03739737b7b0195b052e8cce45423b835 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 3 Sep 2024 12:22:10 -0700 Subject: [PATCH 17/24] ModelRunner documentation --- casanovo/denovo/model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index a7302565..adc4c8cc 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -46,6 +46,9 @@ class ModelRunner: output_rootname : str | None, optional The root name for checkpoint files (e.g., checkpoints or results). If `None` no base name will be used for checkpoint files. + overwrite_ckpt_check: bool, optional + Whether to check output_dir (if not `None`) for conflicting checkpoint + files. """ def __init__( From 4cb18e1cc182997fa76aa26d626bed1498637156 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 4 Sep 2024 12:09:16 -0700 Subject: [PATCH 18/24] requested changes, _setup_output unit test --- casanovo/casanovo.py | 10 ++++++---- casanovo/denovo/model_runner.py | 2 +- tests/unit_tests/test_unit.py | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index d146d5f5..95b3a183 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -523,7 +523,7 @@ def _setup_output( be resolved to the current working directory. output_root : str | None The base name for the output files. If `None` the output root name will - be resolved to casanovo_ + be resolved to casanovo_ overwrite: bool Whether to overwrite log file if it already exists in the output directory. @@ -544,10 +544,12 @@ def _setup_output( if output_dir is None: output_path = Path.cwd() else: - output_path = Path(output_dir) + output_path = Path(output_dir).expanduser().resolve() if not output_path.is_dir(): - raise FileNotFoundError( - f"Target output directory {output_dir} does not exists." + output_path.mkdir(parents=True) + warnings.warn( + f"Target output directory {output_dir} does not exists, " + "so it will be created." ) if not overwrite: diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index adc4c8cc..73d1da77 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -72,7 +72,7 @@ def __init__( if output_dir is None: self.callbacks = [] - warnings.warn( + logger.warning( "Checkpoint directory not set in ModelRunner, " "no checkpoint files will be saved." ) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index f221d502..9ad8490d 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -7,6 +7,7 @@ import os import pathlib import platform +import re import requests import shutil import tempfile @@ -944,3 +945,22 @@ def test_check_dir(tmp_path): utils.check_dir_file_exists(tmp_path, [dne_pattern]) utils.check_dir_file_exists(tmp_path, dne_pattern) + + +def test_setup_output(tmp_path, monkeypatch): + with monkeypatch.context() as mnk: + mnk.setattr(pathlib.Path, "cwd", lambda: tmp_path) + output_path, output_root = casanovo._setup_output( + None, None, False, "info" + ) + assert output_path.resolve() == tmp_path.resolve() + assert re.fullmatch(r"casanovo_\d+", output_root) is not None + + target_path = tmp_path / "foo" + with pytest.warns(UserWarning): + output_path, output_root = casanovo._setup_output( + str(target_path), "bar", False, "info" + ) + + assert output_path.resolve() == target_path.resolve() + assert output_root == "bar" From e36a4c94d3340ffd3dda7a52f904ef9a99b47baa Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 5 Sep 2024 14:38:16 -0700 Subject: [PATCH 19/24] write tensorboard to output directory --- casanovo/config.py | 2 +- casanovo/config.yaml | 4 ++-- casanovo/denovo/model_runner.py | 15 +++++++++++++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index dae46606..e1670aa3 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -64,7 +64,7 @@ class Config: max_length=int, residues=dict, n_log=int, - tb_summarywriter=str, + tb_summarywriter=bool, train_label_smoothing=float, warmup_iters=int, cosine_schedule_period_iters=int, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 1fcc77b4..59df016b 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -40,8 +40,8 @@ random_seed: 454 # OUTPUT OPTIONS # Logging frequency in training steps. n_log: 1 -# Tensorboard directory to use for keeping track of training metrics. -tb_summarywriter: +# Whether to create tensorboard directory +tb_summarywriter: false # Model validation and checkpointing frequency in training steps. val_check_interval: 50_000 diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 73d1da77..9e94199e 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -62,6 +62,7 @@ def __init__( """Initialize a ModelRunner""" self.config = config self.model_filename = model_filename + self.output_dir = output_dir # Initialized later: self.tmp_dir = None @@ -268,6 +269,16 @@ def initialize_model(self, train: bool) -> None: Determines whether to set the model up for model training or evaluation / inference. """ + tb_summarywriter = None + if self.config.tb_summarywriter: + if self.output_dir is None: + logger.warning( + "Can not create tensorboard because the output directory " + "is not set in the model runner." + ) + else: + tb_summarywriter = self.output_dir / "tensorboard" + model_params = dict( dim_model=self.config.dim_model, n_head=self.config.n_head, @@ -284,7 +295,7 @@ def initialize_model(self, train: bool) -> None: n_beams=self.config.n_beams, top_match=self.config.top_match, n_log=self.config.n_log, - tb_summarywriter=self.config.tb_summarywriter, + tb_summarywriter=tb_summarywriter, train_label_smoothing=self.config.train_label_smoothing, warmup_iters=self.config.warmup_iters, cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, @@ -303,7 +314,7 @@ def initialize_model(self, train: bool) -> None: min_peptide_len=self.config.min_peptide_len, top_match=self.config.top_match, n_log=self.config.n_log, - tb_summarywriter=self.config.tb_summarywriter, + tb_summarywriter=tb_summarywriter, train_label_smoothing=self.config.train_label_smoothing, warmup_iters=self.config.warmup_iters, cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, From 503fb86082b7244256a9b9067bafbe713d2487ee Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 11 Sep 2024 17:46:39 -0700 Subject: [PATCH 20/24] ModelRunner output root bug fix, setup_model documentation, sequence output setup bug fix --- casanovo/casanovo.py | 62 +++++++++++++++++++++++------------ tests/unit_tests/test_unit.py | 7 ++-- 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 95b3a183..f2dd4232 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -168,13 +168,21 @@ def sequence( to sequence peptides. If evaluate is set to True PEAK_PATH must be one or more annotated MGF file. """ - output_path, output_root = _setup_output( + output_path, output_root_name = _setup_output( output_dir, output_root, force_overwrite, verbosity ) utils.check_dir_file_exists(output_path, f"{output_root}.mztab") - config, model = setup_model(model, config, output_dir, output_root, False) + config, model = setup_model( + model, config, output_path, output_root_name, False + ) start_time = time.time() - with ModelRunner(config, model, output_path, output_root, False) as runner: + with ModelRunner( + config, + model, + output_path, + output_root_name if output_root is not None else None, + False, + ) as runner: logger.info( "Sequencing %speptides from:", "and evaluating " if evaluate else "", @@ -226,13 +234,19 @@ def train( TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB, from which to train a new Casnovo model. """ - output_path, output_root = _setup_output( + output_path, output_root_name = _setup_output( output_dir, output_root, force_overwrite, verbosity ) - config, model = setup_model(model, config, output_path, output_root, True) + config, model = setup_model( + model, config, output_path, output_root_name, True + ) start_time = time.time() with ModelRunner( - config, model, output_path, output_root, not force_overwrite + config, + model, + output_path, + output_root_name if output_root is not None else None, + not force_overwrite, ) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: @@ -339,30 +353,36 @@ def setup_logging( def setup_model( - model: Optional[str], - config: Optional[str], - output_dir: Optional[Path | str], - output_root_name: Optional[str], + model: str | None, + config: str | None, + output_dir: Path | str, + output_root_name: str, is_train: bool, -) -> Config: - """Setup Casanovo for most commands. +) -> Tuple[Config, Path | None]: + """Setup Casanovo config and resolve model weights (.ckpt) path Parameters ---------- - model : Optional[str] - The provided model weights file. - config : Optional[str] - The provided configuration file. - output : Optional[Path] - The provided output file name. + model : str | None + May be a file system path, a URL pointing to a .ckpt file, or None. + If `model` is a URL the weights will be downloaded and cached from + `model`. If `model` is `None` the weights from the latest matching + official release will be used (downloaded and cached). + config : str | None + Config file path. If None the default config will be used. + output_dir: : Path | str + The path to the output directory. + output_root_name : str, + The base name for the output files. is_train : bool Are we training? If not, we need to retrieve weights when the model is None. Return ------ - config : Config - The parsed configuration + Tuple[Config, Path] + Initialized Casanovo config, local path to model weights if any (may be + `None` if training using random starting weights). """ # Read parameters from the config file. config = Config(config) @@ -547,7 +567,7 @@ def _setup_output( output_path = Path(output_dir).expanduser().resolve() if not output_path.is_dir(): output_path.mkdir(parents=True) - warnings.warn( + logger.warning( f"Target output directory {output_dir} does not exists, " "so it will be created." ) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 9ad8490d..18136ab2 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -957,10 +957,9 @@ def test_setup_output(tmp_path, monkeypatch): assert re.fullmatch(r"casanovo_\d+", output_root) is not None target_path = tmp_path / "foo" - with pytest.warns(UserWarning): - output_path, output_root = casanovo._setup_output( - str(target_path), "bar", False, "info" - ) + output_path, output_root = casanovo._setup_output( + str(target_path), "bar", False, "info" + ) assert output_path.resolve() == target_path.resolve() assert output_root == "bar" From 71dc50c556d4c7d04cf13139132cfd2bca9d7ce2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 12 Sep 2024 00:50:41 +0000 Subject: [PATCH 21/24] Generate new screengrabs with rich-codex --- docs/images/help.svg | 159 +++---------------------------------------- 1 file changed, 11 insertions(+), 148 deletions(-) diff --git a/docs/images/help.svg b/docs/images/help.svg index dbdc05e0..baf2e237 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -1,4 +1,4 @@ - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - - $ casanovo --help - -Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.                                                            - - Links:                                                                          - - • Documentation: https://casanovo.readthedocs.io                               - • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - - • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -mass spectrometry peptide sequencing with a transformer model. Proceedings   -of the 39th International Conference on Machine Learning - ICML '22 (2022)   -doi:10.1101/2022.02.07.479481.                                               - -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---help-h    Show this message and exit.                                     -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -configure Generate a Casanovo configuration file to customize.               -sequence  De novo sequence peptides from tandem mass spectra.                -train     Train a Casanovo model on your own data.                           -version   Get the Casanovo version information                               -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help From 6ba9bb39b0e6381326424a7f76793d87d21b1fad Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 16 Sep 2024 12:00:50 -0700 Subject: [PATCH 22/24] logging format character --- casanovo/casanovo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index f2dd4232..c311bf50 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -568,8 +568,8 @@ def _setup_output( if not output_path.is_dir(): output_path.mkdir(parents=True) logger.warning( - f"Target output directory {output_dir} does not exists, " - "so it will be created." + "Target output directory %s does not exists, so it will be created.", + output_path, ) if not overwrite: From 4ec76c2275f80c795d64e42329465d3a6502ac5a Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 5 Sep 2024 14:38:16 -0700 Subject: [PATCH 23/24] write tensorboard to output directory --- casanovo/config.py | 2 +- casanovo/config.yaml | 4 ++-- casanovo/denovo/model_runner.py | 15 +++++++++++++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index dae46606..e1670aa3 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -64,7 +64,7 @@ class Config: max_length=int, residues=dict, n_log=int, - tb_summarywriter=str, + tb_summarywriter=bool, train_label_smoothing=float, warmup_iters=int, cosine_schedule_period_iters=int, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 1fcc77b4..59df016b 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -40,8 +40,8 @@ random_seed: 454 # OUTPUT OPTIONS # Logging frequency in training steps. n_log: 1 -# Tensorboard directory to use for keeping track of training metrics. -tb_summarywriter: +# Whether to create tensorboard directory +tb_summarywriter: false # Model validation and checkpointing frequency in training steps. val_check_interval: 50_000 diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 73d1da77..9e94199e 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -62,6 +62,7 @@ def __init__( """Initialize a ModelRunner""" self.config = config self.model_filename = model_filename + self.output_dir = output_dir # Initialized later: self.tmp_dir = None @@ -268,6 +269,16 @@ def initialize_model(self, train: bool) -> None: Determines whether to set the model up for model training or evaluation / inference. """ + tb_summarywriter = None + if self.config.tb_summarywriter: + if self.output_dir is None: + logger.warning( + "Can not create tensorboard because the output directory " + "is not set in the model runner." + ) + else: + tb_summarywriter = self.output_dir / "tensorboard" + model_params = dict( dim_model=self.config.dim_model, n_head=self.config.n_head, @@ -284,7 +295,7 @@ def initialize_model(self, train: bool) -> None: n_beams=self.config.n_beams, top_match=self.config.top_match, n_log=self.config.n_log, - tb_summarywriter=self.config.tb_summarywriter, + tb_summarywriter=tb_summarywriter, train_label_smoothing=self.config.train_label_smoothing, warmup_iters=self.config.warmup_iters, cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, @@ -303,7 +314,7 @@ def initialize_model(self, train: bool) -> None: min_peptide_len=self.config.min_peptide_len, top_match=self.config.top_match, n_log=self.config.n_log, - tb_summarywriter=self.config.tb_summarywriter, + tb_summarywriter=tb_summarywriter, train_label_smoothing=self.config.train_label_smoothing, warmup_iters=self.config.warmup_iters, cosine_schedule_period_iters=self.config.cosine_schedule_period_iters, From fa8ee9aef02fdfb5ecd30fa3649c5201df23ac57 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Sep 2024 19:15:57 +0000 Subject: [PATCH 24/24] Generate new screengrabs with rich-codex --- docs/images/help.svg | 159 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 148 insertions(+), 11 deletions(-) diff --git a/docs/images/help.svg b/docs/images/help.svg index baf2e237..dbdc05e0 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -1,4 +1,4 @@ - + - - + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo --help + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io                               + • Official code repository: https://github.com/Noble-Lab/casanovo              + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +configure Generate a Casanovo configuration file to customize.               +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           +version   Get the Casanovo version information                               +╰──────────────────────────────────────────────────────────────────────────────╯ +