Merge branch 'dev' into szsz-drop-version-engine-from-gnomad-steps

opentargets · Dec 13, 2024
2 parents: 036ba86 + 2fdf343
commit d179eb4
Show file tree

Hide file tree

Showing 8 changed files with 1,913 additions and 1,642 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,64 +19,63 @@ packages = [{ include = "gentropy", from = "src" }]
 gentropy = "gentropy.cli:main"
 
 [tool.poetry.dependencies]
-python = "^3.10, <3.11"
+python = ">=3.10, <3.11"
 pyspark = "3.3.4"
-scipy = "^1.11.4"
-hydra-core = "^1.3.2"
-pyliftover = "^0.4"
-numpy = "^1.26.2"
+scipy = ">=1.11.4, <1.12.0"
+hydra-core = ">=1.3.2, <1.4.0"
+pyliftover = ">=0.4, <0.5"
+numpy = ">=1.26.2, <1.27.0"
 hail = "0.2.127"
-wandb = ">=0.16.2,<0.19.0"
-google = "^3.0.0"
-omegaconf = "^2.3.0"
-typing-extensions = "^4.9.0"
-scikit-learn = "^1.3.2"
-pandas = { extras = ["gcp", "parquet"], version = "^2.2.2" }
-skops = ">=0.9,<0.11"
-google-cloud-secret-manager = "^2.20.0"
-shap = "^0.46.0"
-matplotlib = "3.7.3"
-
-[tool.poetry.dev-dependencies]
-pre-commit = "^4.0.0"
-mypy = "^1.13"
-pep8-naming = "^0.14.1"
-interrogate = "^1.7.0"
-isort = "^5.13.2"
-darglint = "^1.8.1"
-ruff = "^0.8.1"
+wandb = ">=0.19.0, <0.20.0"
+google = ">=3.0.0, <3.1.0"
+omegaconf = ">=2.3.0, <2.4.0"
+typing-extensions = ">=4.9.0, <4.13.0"
+scikit-learn = ">=1.6.0, <1.7.0"
+pandas = { extras = ["gcp", "parquet"], version = ">=2.2.2, <2.3.0" }
+skops = ">=0.11, <0.12"
+google-cloud-secret-manager = ">=2.20.0, <2.22.0"
+shap = ">=0.46.0, <0.47.0"
+matplotlib = ">=3.7.3, <3.8.0"
+
 
 [tool.poetry.group.docs.dependencies]
-mkdocs = "^1.5.3"
-mkdocstrings-python = "^1.8.0"
+mkdocs = ">=1.5.3, <1.6.0"
+mkdocstrings-python = ">=1.12.2, <1.13.0"
 mkdocs-material = "*"
-mkdocs-section-index = "^0.3.4"
-mkdocs-git-revision-date-localized-plugin = "^1.2.2"
-mkdocs-autolinks-plugin = "^0.7.1"
-mkdocs-awesome-pages-plugin = "^2.9.2"
-mkdocs-exclude = "^1.0.2"
-mkdocs-git-committers-plugin-2 = "^2.2.3"
-lxml = "^5.1.0"
-pymdown-extensions = "^10.7"
+mkdocs-section-index = ">=0.3.4, <0.4.0"
+mkdocs-git-revision-date-localized-plugin = ">=1.2.2, <1.4.0"
+mkdocs-autolinks-plugin = ">=0.7.1, <0.8.0"
+mkdocs-awesome-pages-plugin = ">=2.9.2, <3.0.0"
+mkdocs-exclude = ">=1.0.2, <1.1.0"
+mkdocs-git-committers-plugin-2 = ">=2.2.3, <2.5.0"
+lxml = ">=5.1.0, <5.4.0"
+pymdown-extensions = ">=10.7, <10.13"
 
 
 [tool.poetry.group.tests.dependencies]
 pytest-cov = ">=4.1,<7.0"
 pytest-sugar = ">=0.9.5,<1.1.0"
 dbldatagen = ">=0.3.1,<0.5.0"
-pyparsing = "^3.1.1"
+pyparsing = ">=3.1.1, <3.3.0"
 pytest = ">=7.4.4,<9.0.0"
-pytest-xdist = "^3.5.0"
+pytest-xdist = ">=3.5.0, <3.7.0"
 
 
 [tool.poetry.group.dev.dependencies]
-ipython = "^8.19.0"
-ipykernel = "^6.28.0"
-google-cloud-dataproc = "^5.8.0"
+ipython = ">=8.19.0, <8.31.0"
+ipykernel = ">=6.28.0, <6.30.0"
+google-cloud-dataproc = ">=5.8.0, <5.16.0"
 pydoclint = ">=0.3.8,<0.6.0"
-prettier = "^0.0.7"
+prettier = ">=0.0.7, <0.1.0"
 deptry = ">=0.12,<0.21"
-yamllint = "^1.33.0"
+yamllint = ">=1.33.0, <1.36.0"
+pre-commit = ">=4.0.0, <4.1.0"
+mypy = ">=1.13, <1.14"
+pep8-naming = ">=0.14.1, <0.15.0"
+interrogate = ">=1.7.0, <1.8.0"
+isort = ">=5.13.2, <5.14.0"
+darglint = ">=1.8.1, <1.9.0"
+ruff = ">=0.8.1, <0.9.0"
 
 [tool.semantic_release]
 logging_use_named_masks = true

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -267,20 +267,24 @@ class LocusToGeneConfig(StepConfig):
             "geneCount500kb",
             "proteinGeneCount500kb",
             "credibleSetConfidence",
-            # "isProteinCoding",
         ]
     )
     hyperparameters: dict[str, Any] = field(
         default_factory=lambda: {
             "n_estimators": 100,
-            "max_depth": 5,
-            "loss": "log_loss",
+            "max_depth": 10,
+            "ccp_alpha": 0,
+            "learning_rate": 0.1,
+            "min_samples_leaf": 5,
+            "min_samples_split": 5,
+            "subsample": 1,
         }
     )
     wandb_run_name: str | None = None
     hf_hub_repo_id: str | None = "opentargets/locus_to_gene"
     hf_model_commit_message: str | None = "chore: update model"
     download_from_hub: bool = True
+    cross_validate: bool = True
     _target_: str = "gentropy.l2g.LocusToGeneStep"
 
 

diff --git a/src/gentropy/dataset/l2g_prediction.py b/src/gentropy/dataset/l2g_prediction.py
@@ -129,12 +129,13 @@ def to_disease_target_evidence(
         )
 
     def add_locus_to_gene_features(
-        self: L2GPrediction, feature_matrix: L2GFeatureMatrix
+        self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
     ) -> L2GPrediction:
-        """Add features to the L2G predictions.
+        """Add features used to extract the L2G predictions.
 
         Args:
             feature_matrix (L2GFeatureMatrix): Feature matrix dataset
+            features_list (list[str]): List of features used in the model
 
         Returns:
             L2GPrediction: L2G predictions with additional features
@@ -143,38 +144,26 @@ def add_locus_to_gene_features(
         if "locusToGeneFeatures" in self.df.columns:
             self.df = self.df.drop("locusToGeneFeatures")
 
-        # Columns identifying a studyLocus/gene pair
-        prediction_id_columns = ["studyLocusId", "geneId"]
-
-        # L2G matrix columns to build the map:
-        columns_to_map = [
-            column
-            for column in feature_matrix._df.columns
-            if column not in prediction_id_columns
-        ]
-
         # Aggregating all features into a single map column:
         aggregated_features = (
             feature_matrix._df.withColumn(
                 "locusToGeneFeatures",
                 f.create_map(
                     *sum(
-                        [
-                            (f.lit(colname), f.col(colname))
-                            for colname in columns_to_map
-                        ],
+                        ((f.lit(feature), f.col(feature)) for feature in features_list),
                         (),
                     )
                 ),
             )
-            # from the freshly created map, we filter out the null values
             .withColumn(
                 "locusToGeneFeatures",
-                f.expr("map_filter(locusToGeneFeatures, (k, v) -> v is not null)"),
+                f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
             )
-            .drop(*columns_to_map)
+            .drop(*features_list)
         )
         return L2GPrediction(
-            _df=self.df.join(aggregated_features, on=prediction_id_columns, how="left"),
+            _df=self.df.join(
+                aggregated_features, on=["studyLocusId", "geneId"], how="left"
+            ),
             _schema=self.get_schema(),
         )
diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py
@@ -7,7 +7,7 @@
 
 import pyspark.sql.functions as f
 from sklearn.ensemble import GradientBoostingClassifier
-from wandb import login as wandb_login
+from wandb.sdk.wandb_login import login as wandb_login
 
 from gentropy.common.schemas import compare_struct_schemas
 from gentropy.common.session import Session
@@ -100,11 +100,12 @@ class LocusToGeneStep:
     def __init__(
         self,
         session: Session,
-        hyperparameters: dict[str, Any],
         *,
         run_mode: str,
         features_list: list[str],
+        hyperparameters: dict[str, Any],
         download_from_hub: bool,
+        cross_validate: bool,
         wandb_run_name: str,
         credible_set_path: str,
         feature_matrix_path: str,
@@ -113,18 +114,19 @@ def __init__(
         variant_index_path: str | None = None,
         gene_interactions_path: str | None = None,
         predictions_path: str | None = None,
-        l2g_threshold: float | None,
-        hf_hub_repo_id: str | None,
+        l2g_threshold: float | None = None,
+        hf_hub_repo_id: str | None = None,
         hf_model_commit_message: str | None = "chore: update model",
     ) -> None:
         """Initialise the step and run the logic based on mode.
 
         Args:
             session (Session): Session object that contains the Spark session
-            hyperparameters (dict[str, Any]): Hyperparameters for the model
             run_mode (str): Run mode, either 'train' or 'predict'
             features_list (list[str]): List of features to use for the model
+            hyperparameters (dict[str, Any]): Hyperparameters for the model
             download_from_hub (bool): Whether to download the model from Hugging Face Hub
+            cross_validate (bool): Whether to run cross validation (5-fold by default) to train the model.
             wandb_run_name (str): Name of the run to track model training in Weights and Biases
             credible_set_path (str): Path to the credible set dataset necessary to build the feature matrix
             feature_matrix_path (str): Path to the L2G feature matrix input dataset
@@ -152,6 +154,7 @@ def __init__(
         self.features_list = list(features_list)
         self.hyperparameters = dict(hyperparameters)
         self.wandb_run_name = wandb_run_name
+        self.cross_validate = cross_validate
         self.hf_hub_repo_id = hf_hub_repo_id
         self.download_from_hub = download_from_hub
         self.hf_model_commit_message = hf_model_commit_message
@@ -285,9 +288,11 @@ def run_predict(self) -> None:
         )
         predictions.filter(
             f.col("score") >= self.l2g_threshold
-        ).add_locus_to_gene_features(self.feature_matrix).df.coalesce(
-            self.session.output_partitions
-        ).write.mode(self.session.write_mode).parquet(self.predictions_path)
+        ).add_locus_to_gene_features(
+            self.feature_matrix, self.features_list
+        ).df.coalesce(self.session.output_partitions).write.mode(
+            self.session.write_mode
+        ).parquet(self.predictions_path)
         self.session.logger.info("L2G predictions saved successfully.")
 
     def run_train(self) -> None:
@@ -298,7 +303,7 @@ def run_train(self) -> None:
 
         # Instantiate classifier and train model
         l2g_model = LocusToGeneModel(
-            model=GradientBoostingClassifier(random_state=42),
+            model=GradientBoostingClassifier(random_state=42, loss="log_loss"),
             hyperparameters=self.hyperparameters,
         )
 
@@ -308,7 +313,7 @@ def run_train(self) -> None:
         # Run the training
         trained_model = LocusToGeneTrainer(
             model=l2g_model, feature_matrix=feature_matrix
-        ).train(self.wandb_run_name)
+        ).train(self.wandb_run_name, cross_validate=self.cross_validate)
 
         # Export the model
         if trained_model.training_data and trained_model.model and self.model_path:

diff --git a/src/gentropy/method/l2g/model.py b/src/gentropy/method/l2g/model.py
@@ -27,7 +27,17 @@ class LocusToGeneModel:
     """Wrapper for the Locus to Gene classifier."""
 
     model: Any = GradientBoostingClassifier(random_state=42)
-    hyperparameters: dict[str, Any] | None = None
+    hyperparameters: dict[str, Any] = field(
+        default_factory=lambda: {
+            "n_estimators": 100,
+            "max_depth": 10,
+            "ccp_alpha": 0,
+            "learning_rate": 0.1,
+            "min_samples_leaf": 5,
+            "min_samples_split": 5,
+            "subsample": 1,
+        }
+    )
     training_data: L2GFeatureMatrix | None = None
     label_encoder: dict[str, int] = field(
         default_factory=lambda: {
@@ -38,8 +48,7 @@ class LocusToGeneModel:
 
     def __post_init__(self: LocusToGeneModel) -> None:
         """Post-initialisation to fit the estimator with the provided params."""
-        if self.hyperparameters:
-            self.model.set_params(**self.hyperparameters_dict)
+        self.model.set_params(**self.hyperparameters_dict)
 
     @classmethod
     def load_from_disk(cls: Type[LocusToGeneModel], path: str) -> LocusToGeneModel: