Skip to content

Commit

Permalink
Merge branch 'dev' into szsz-drop-version-engine-from-gnomad-steps
Browse files: browse the repository at this point in the history
  • Loading branch information
project-defiant authored Dec 13, 2024
2 parents 036ba86 + 2fdf343 commit d179eb4
Show file tree
Hide file tree
Showing 8 changed files with 1,913 additions and 1,642 deletions.
2,970 changes: 1,522 additions & 1,448 deletions poetry.lock

Large diffs are not rendered by default.

81 changes: 40 additions & 41 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,64 +19,63 @@ packages = [{ include = "gentropy", from = "src" }]
gentropy = "gentropy.cli:main"

[tool.poetry.dependencies]
python = "^3.10, <3.11"
python = ">=3.10, <3.11"
pyspark = "3.3.4"
scipy = "^1.11.4"
hydra-core = "^1.3.2"
pyliftover = "^0.4"
numpy = "^1.26.2"
scipy = ">=1.11.4, <1.12.0"
hydra-core = ">=1.3.2, <1.4.0"
pyliftover = ">=0.4, <0.5"
numpy = ">=1.26.2, <1.27.0"
hail = "0.2.127"
wandb = ">=0.16.2,<0.19.0"
google = "^3.0.0"
omegaconf = "^2.3.0"
typing-extensions = "^4.9.0"
scikit-learn = "^1.3.2"
pandas = { extras = ["gcp", "parquet"], version = "^2.2.2" }
skops = ">=0.9,<0.11"
google-cloud-secret-manager = "^2.20.0"
shap = "^0.46.0"
matplotlib = "3.7.3"

[tool.poetry.dev-dependencies]
pre-commit = "^4.0.0"
mypy = "^1.13"
pep8-naming = "^0.14.1"
interrogate = "^1.7.0"
isort = "^5.13.2"
darglint = "^1.8.1"
ruff = "^0.8.1"
wandb = ">=0.19.0, <0.20.0"
google = ">=3.0.0, <3.1.0"
omegaconf = ">=2.3.0, <2.4.0"
typing-extensions = ">=4.9.0, <4.13.0"
scikit-learn = ">=1.6.0, <1.7.0"
pandas = { extras = ["gcp", "parquet"], version = ">=2.2.2, <2.3.0" }
skops = ">=0.11, <0.12"
google-cloud-secret-manager = ">=2.20.0, <2.22.0"
shap = ">=0.46.0, <0.47.0"
matplotlib = ">=3.7.3, <3.8.0"


[tool.poetry.group.docs.dependencies]
mkdocs = "^1.5.3"
mkdocstrings-python = "^1.8.0"
mkdocs = ">=1.5.3, <1.6.0"
mkdocstrings-python = ">=1.12.2, <1.13.0"
mkdocs-material = "*"
mkdocs-section-index = "^0.3.4"
mkdocs-git-revision-date-localized-plugin = "^1.2.2"
mkdocs-autolinks-plugin = "^0.7.1"
mkdocs-awesome-pages-plugin = "^2.9.2"
mkdocs-exclude = "^1.0.2"
mkdocs-git-committers-plugin-2 = "^2.2.3"
lxml = "^5.1.0"
pymdown-extensions = "^10.7"
mkdocs-section-index = ">=0.3.4, <0.4.0"
mkdocs-git-revision-date-localized-plugin = ">=1.2.2, <1.4.0"
mkdocs-autolinks-plugin = ">=0.7.1, <0.8.0"
mkdocs-awesome-pages-plugin = ">=2.9.2, <3.0.0"
mkdocs-exclude = ">=1.0.2, <1.1.0"
mkdocs-git-committers-plugin-2 = ">=2.2.3, <2.5.0"
lxml = ">=5.1.0, <5.4.0"
pymdown-extensions = ">=10.7, <10.13"


[tool.poetry.group.tests.dependencies]
pytest-cov = ">=4.1,<7.0"
pytest-sugar = ">=0.9.5,<1.1.0"
dbldatagen = ">=0.3.1,<0.5.0"
pyparsing = "^3.1.1"
pyparsing = ">=3.1.1, <3.3.0"
pytest = ">=7.4.4,<9.0.0"
pytest-xdist = "^3.5.0"
pytest-xdist = ">=3.5.0, <3.7.0"


[tool.poetry.group.dev.dependencies]
ipython = "^8.19.0"
ipykernel = "^6.28.0"
google-cloud-dataproc = "^5.8.0"
ipython = ">=8.19.0, <8.31.0"
ipykernel = ">=6.28.0, <6.30.0"
google-cloud-dataproc = ">=5.8.0, <5.16.0"
pydoclint = ">=0.3.8,<0.6.0"
prettier = "^0.0.7"
prettier = ">=0.0.7, <0.1.0"
deptry = ">=0.12,<0.21"
yamllint = "^1.33.0"
yamllint = ">=1.33.0, <1.36.0"
pre-commit = ">=4.0.0, <4.1.0"
mypy = ">=1.13, <1.14"
pep8-naming = ">=0.14.1, <0.15.0"
interrogate = ">=1.7.0, <1.8.0"
isort = ">=5.13.2, <5.14.0"
darglint = ">=1.8.1, <1.9.0"
ruff = ">=0.8.1, <0.9.0"

[tool.semantic_release]
logging_use_named_masks = true
Expand Down
10 changes: 7 additions & 3 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,20 +267,24 @@ class LocusToGeneConfig(StepConfig):
"geneCount500kb",
"proteinGeneCount500kb",
"credibleSetConfidence",
# "isProteinCoding",
]
)
hyperparameters: dict[str, Any] = field(
default_factory=lambda: {
"n_estimators": 100,
"max_depth": 5,
"loss": "log_loss",
"max_depth": 10,
"ccp_alpha": 0,
"learning_rate": 0.1,
"min_samples_leaf": 5,
"min_samples_split": 5,
"subsample": 1,
}
)
wandb_run_name: str | None = None
hf_hub_repo_id: str | None = "opentargets/locus_to_gene"
hf_model_commit_message: str | None = "chore: update model"
download_from_hub: bool = True
cross_validate: bool = True
_target_: str = "gentropy.l2g.LocusToGeneStep"


Expand Down
29 changes: 9 additions & 20 deletions src/gentropy/dataset/l2g_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,13 @@ def to_disease_target_evidence(
)

def add_locus_to_gene_features(
self: L2GPrediction, feature_matrix: L2GFeatureMatrix
self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
) -> L2GPrediction:
"""Add features to the L2G predictions.
"""Add features used to extract the L2G predictions.
Args:
feature_matrix (L2GFeatureMatrix): Feature matrix dataset
features_list (list[str]): List of features used in the model
Returns:
L2GPrediction: L2G predictions with additional features
Expand All @@ -143,38 +144,26 @@ def add_locus_to_gene_features(
if "locusToGeneFeatures" in self.df.columns:
self.df = self.df.drop("locusToGeneFeatures")

# Columns identifying a studyLocus/gene pair
prediction_id_columns = ["studyLocusId", "geneId"]

# L2G matrix columns to build the map:
columns_to_map = [
column
for column in feature_matrix._df.columns
if column not in prediction_id_columns
]

# Aggregating all features into a single map column:
aggregated_features = (
feature_matrix._df.withColumn(
"locusToGeneFeatures",
f.create_map(
*sum(
[
(f.lit(colname), f.col(colname))
for colname in columns_to_map
],
((f.lit(feature), f.col(feature)) for feature in features_list),
(),
)
),
)
# from the freshly created map, we filter out the null values
.withColumn(
"locusToGeneFeatures",
f.expr("map_filter(locusToGeneFeatures, (k, v) -> v is not null)"),
f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
)
.drop(*columns_to_map)
.drop(*features_list)
)
return L2GPrediction(
_df=self.df.join(aggregated_features, on=prediction_id_columns, how="left"),
_df=self.df.join(
aggregated_features, on=["studyLocusId", "geneId"], how="left"
),
_schema=self.get_schema(),
)
25 changes: 15 additions & 10 deletions src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pyspark.sql.functions as f
from sklearn.ensemble import GradientBoostingClassifier
from wandb import login as wandb_login
from wandb.sdk.wandb_login import login as wandb_login

from gentropy.common.schemas import compare_struct_schemas
from gentropy.common.session import Session
Expand Down Expand Up @@ -100,11 +100,12 @@ class LocusToGeneStep:
def __init__(
self,
session: Session,
hyperparameters: dict[str, Any],
*,
run_mode: str,
features_list: list[str],
hyperparameters: dict[str, Any],
download_from_hub: bool,
cross_validate: bool,
wandb_run_name: str,
credible_set_path: str,
feature_matrix_path: str,
Expand All @@ -113,18 +114,19 @@ def __init__(
variant_index_path: str | None = None,
gene_interactions_path: str | None = None,
predictions_path: str | None = None,
l2g_threshold: float | None,
hf_hub_repo_id: str | None,
l2g_threshold: float | None = None,
hf_hub_repo_id: str | None = None,
hf_model_commit_message: str | None = "chore: update model",
) -> None:
"""Initialise the step and run the logic based on mode.
Args:
session (Session): Session object that contains the Spark session
hyperparameters (dict[str, Any]): Hyperparameters for the model
run_mode (str): Run mode, either 'train' or 'predict'
features_list (list[str]): List of features to use for the model
hyperparameters (dict[str, Any]): Hyperparameters for the model
download_from_hub (bool): Whether to download the model from Hugging Face Hub
cross_validate (bool): Whether to run cross validation (5-fold by default) to train the model.
wandb_run_name (str): Name of the run to track model training in Weights and Biases
credible_set_path (str): Path to the credible set dataset necessary to build the feature matrix
feature_matrix_path (str): Path to the L2G feature matrix input dataset
Expand Down Expand Up @@ -152,6 +154,7 @@ def __init__(
self.features_list = list(features_list)
self.hyperparameters = dict(hyperparameters)
self.wandb_run_name = wandb_run_name
self.cross_validate = cross_validate
self.hf_hub_repo_id = hf_hub_repo_id
self.download_from_hub = download_from_hub
self.hf_model_commit_message = hf_model_commit_message
Expand Down Expand Up @@ -285,9 +288,11 @@ def run_predict(self) -> None:
)
predictions.filter(
f.col("score") >= self.l2g_threshold
).add_locus_to_gene_features(self.feature_matrix).df.coalesce(
self.session.output_partitions
).write.mode(self.session.write_mode).parquet(self.predictions_path)
).add_locus_to_gene_features(
self.feature_matrix, self.features_list
).df.coalesce(self.session.output_partitions).write.mode(
self.session.write_mode
).parquet(self.predictions_path)
self.session.logger.info("L2G predictions saved successfully.")

def run_train(self) -> None:
Expand All @@ -298,7 +303,7 @@ def run_train(self) -> None:

# Instantiate classifier and train model
l2g_model = LocusToGeneModel(
model=GradientBoostingClassifier(random_state=42),
model=GradientBoostingClassifier(random_state=42, loss="log_loss"),
hyperparameters=self.hyperparameters,
)

Expand All @@ -308,7 +313,7 @@ def run_train(self) -> None:
# Run the training
trained_model = LocusToGeneTrainer(
model=l2g_model, feature_matrix=feature_matrix
).train(self.wandb_run_name)
).train(self.wandb_run_name, cross_validate=self.cross_validate)

# Export the model
if trained_model.training_data and trained_model.model and self.model_path:
Expand Down
15 changes: 12 additions & 3 deletions src/gentropy/method/l2g/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,17 @@ class LocusToGeneModel:
"""Wrapper for the Locus to Gene classifier."""

model: Any = GradientBoostingClassifier(random_state=42)
hyperparameters: dict[str, Any] | None = None
hyperparameters: dict[str, Any] = field(
default_factory=lambda: {
"n_estimators": 100,
"max_depth": 10,
"ccp_alpha": 0,
"learning_rate": 0.1,
"min_samples_leaf": 5,
"min_samples_split": 5,
"subsample": 1,
}
)
training_data: L2GFeatureMatrix | None = None
label_encoder: dict[str, int] = field(
default_factory=lambda: {
Expand All @@ -38,8 +48,7 @@ class LocusToGeneModel:

def __post_init__(self: LocusToGeneModel) -> None:
"""Post-initialisation to fit the estimator with the provided params."""
if self.hyperparameters:
self.model.set_params(**self.hyperparameters_dict)
self.model.set_params(**self.hyperparameters_dict)

@classmethod
def load_from_disk(cls: Type[LocusToGeneModel], path: str) -> LocusToGeneModel:
Expand Down
Loading

0 comments on commit d179eb4

Please sign in to comment.