Merge branch 'dev' of https://github.com/opentargets/gentropy into xg1_l2g_intervals
opentargets · Dec 10, 2024 · 1de5fcf · 1de5fcf
2 parents 55f947f + 79f6fcc
commit 1de5fcf
Showing 28 changed files with 502 additions and 274 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ ci:
   skip: [poetry-lock]
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.3
+    rev: v0.7.4
     hooks:
       - id: ruff
         args:

diff --git a/Makefile b/Makefile
@@ -1,8 +1,15 @@
 PROJECT_ID ?= open-targets-genetics-dev
 REGION ?= europe-west1
 APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed  's/"//g')
-REF ?= $$(git rev-parse --abbrev-ref HEAD)
 PACKAGE_VERSION ?= $$(poetry version --short)
+# NOTE: `git rev-parse --abbrev-ref HEAD` prints the literal "HEAD" when the checkout is detached
+# (e.g. on a tag); in that case use the exact tag name so REF distinguishes tags from branch names.
+ifeq ($(shell git rev-parse --abbrev-ref HEAD),HEAD)
+	REF := $(shell git describe --exact-match --tags)
+else
+	REF := $(shell git rev-parse --abbrev-ref HEAD)
+endif
+
 CLEAN_PACKAGE_VERSION := $(shell echo "$(PACKAGE_VERSION)" | tr -cd '[:alnum:]')
 BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${APP_NAME}/${REF}
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -44,7 +44,7 @@ pep8-naming = "^0.14.1"
 interrogate = "^1.7.0"
 isort = "^5.13.2"
 darglint = "^1.8.1"
-ruff = "^0.7.0"
+ruff = "^0.8.1"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.3"

diff --git a/src/gentropy/assets/schemas/l2g_gold_standard.json b/src/gentropy/assets/schemas/l2g_gold_standard.json
@@ -25,6 +25,12 @@
       "nullable": false,
       "metadata": {}
     },
+    {
+      "name": "traitFromSourceMappedId",
+      "type": "string",
+      "nullable": true,
+      "metadata": {}
+    },
     {
       "name": "goldStandardSet",
       "type": "string",

diff --git a/src/gentropy/assets/schemas/vep_json_output.json b/src/gentropy/assets/schemas/vep_json_output.json
@@ -20,6 +20,12 @@
         "containsNull": true,
         "elementType": {
           "fields": [
+            {
+              "metadata": {},
+              "name": "conservation",
+              "nullable": true,
+              "type": "double"
+            },
             {
               "metadata": {},
               "name": "hgvsg",
@@ -294,6 +300,12 @@
         "containsNull": true,
         "elementType": {
           "fields": [
+            {
+              "metadata": {},
+              "name": "conservation",
+              "nullable": true,
+              "type": "double"
+            },
             {
               "metadata": {},
               "name": "alphamissense",

diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
@@ -1,4 +1,5 @@
 """Step to generate biosample index dataset."""
+
 from __future__ import annotations
 
 from gentropy.common.session import Session
@@ -28,10 +29,16 @@ def __init__(
             efo_input_path (str): Input efo dataset path.
             biosample_index_path (str): Output gene index dataset path.
         """
-        cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark)
+        cell_ontology_index = extract_ontology_from_json(
+            cell_ontology_input_path, session.spark
+        )
         uberon_index = extract_ontology_from_json(uberon_input_path, session.spark)
-        efo_index = extract_ontology_from_json(efo_input_path, session.spark).retain_rows_with_ancestor_id(["CL_0000000"])
+        efo_index = extract_ontology_from_json(
+            efo_input_path, session.spark
+        ).retain_rows_with_ancestor_id(["CL_0000000"])
 
         biosample_index = cell_ontology_index.merge_indices([uberon_index, efo_index])
 
-        biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_path)
+        biosample_index.df.coalesce(session.output_partitions).write.mode(
+            session.write_mode
+        ).parquet(biosample_index_path)
diff --git a/src/gentropy/colocalisation.py b/src/gentropy/colocalisation.py
@@ -70,9 +70,9 @@ def __init__(
             coloc = partial(coloc, **colocalisation_method_params)
         colocalisation_results = coloc(overlaps)
         # Load
-        colocalisation_results.df.write.mode(session.write_mode).parquet(
-            f"{coloc_path}/{colocalisation_method.lower()}"
-        )
+        colocalisation_results.df.coalesce(session.output_partitions).write.mode(
+            session.write_mode
+        ).parquet(f"{coloc_path}/{colocalisation_method.lower()}")
 
     @classmethod
     def _get_colocalisation_class(

diff --git a/src/gentropy/common/session.py b/src/gentropy/common/session.py
@@ -24,6 +24,7 @@ def __init__(  # noqa: D107
         hail_home: str | None = None,
         start_hail: bool = False,
         extended_spark_conf: dict[str, str] | None = None,
+        output_partitions: int = 200,
     ) -> None:
         """Initialises spark session and logger.
 
@@ -34,6 +35,7 @@ def __init__(  # noqa: D107
             hail_home (str | None): Path to Hail installation. Defaults to None.
             start_hail (bool): Whether to start Hail. Defaults to False.
             extended_spark_conf (dict[str, str] | None): Extended Spark configuration. Defaults to None.
+            output_partitions (int): Number of partitions for output datasets. Defaults to 200.
         """
         merged_conf = self._create_merged_config(
             start_hail, hail_home, extended_spark_conf
@@ -53,6 +55,7 @@ def __init__(  # noqa: D107
         self.start_hail = start_hail
         if start_hail:
             hl.init(sc=self.spark.sparkContext, log="/dev/null")
+        self.output_partitions = output_partitions
 
     def _default_config(self: Session) -> SparkConf:
         """Default spark configuration.

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -18,6 +18,7 @@ class SessionConfig:
     spark_uri: str = "local[*]"
     hail_home: str = os.path.dirname(hail_location)
     extended_spark_conf: dict[str, str] | None = field(default_factory=dict[str, str])
+    output_partitions: int = 200
     _target_: str = "gentropy.common.session.Session"
 
 
@@ -263,7 +264,6 @@ class LocusToGeneConfig(StepConfig):
             "geneCount500kb",
             "proteinGeneCount500kb",
             "credibleSetConfidence",
-            # "isProteinCoding",
             # intervals
             "pchicMean",
             "pchicMeanNeighbourhood",
@@ -276,19 +276,19 @@ class LocusToGeneConfig(StepConfig):
     hyperparameters: dict[str, Any] = field(
         default_factory=lambda: {
             "n_estimators": 100,
-            "max_depth": 5,
-            "loss": "log_loss",
+            "max_depth": 10,
+            "ccp_alpha": 0,
+            "learning_rate": 0.1,
+            "min_samples_leaf": 5,
+            "min_samples_split": 5,
+            "subsample": 1,
         }
     )
     wandb_run_name: str | None = None
     hf_hub_repo_id: str | None = "opentargets/locus_to_gene"
     hf_model_commit_message: str | None = "chore: update model"
     download_from_hub: bool = True
-    # interval_sources: dict[str, str] | None = {
-    #     "javierre": "gs://genetics_etl_python_playground/static_assets/javierre_2016_preprocessed",
-    #     "thurman": "gs://genetics_etl_python_playground/static_assets/thurman_2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz",
-    #     "andersson": "gs://genetics_etl_python_playground/static_assets/andersson2014/enhancer_tss_associations.bed",
-    # }
+    cross_validate: bool = True
     _target_: str = "gentropy.l2g.LocusToGeneStep"
 
 
@@ -399,7 +399,9 @@ class GnomadVariantConfig(StepConfig):
         }
     )
     variant_annotation_path: str = MISSING
-    gnomad_genomes_path: str = "gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/"
+    gnomad_genomes_path: str = (
+        "gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht/"
+    )
     gnomad_variant_populations: list[str] = field(
         default_factory=lambda: [
             "afr",  # African-American

diff --git a/src/gentropy/dataset/l2g_feature_matrix.py b/src/gentropy/dataset/l2g_feature_matrix.py
@@ -39,6 +39,8 @@ def __init__(
         self.fixed_cols = ["studyLocusId", "geneId"]
         if self.with_gold_standard:
             self.fixed_cols.append("goldStandardSet")
+        if "traitFromSourceMappedId" in _df.columns:
+            self.fixed_cols.append("traitFromSourceMappedId")
 
         self.features_list = features_list or [
             col for col in _df.columns if col not in self.fixed_cols

diff --git a/src/gentropy/dataset/l2g_prediction.py b/src/gentropy/dataset/l2g_prediction.py
@@ -129,12 +129,13 @@ def to_disease_target_evidence(
         )
 
     def add_locus_to_gene_features(
-        self: L2GPrediction, feature_matrix: L2GFeatureMatrix
+        self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
     ) -> L2GPrediction:
-        """Add features to the L2G predictions.
+        """Add features used to extract the L2G predictions.
 
         Args:
             feature_matrix (L2GFeatureMatrix): Feature matrix dataset
+            features_list (list[str]): List of features used in the model
 
         Returns:
             L2GPrediction: L2G predictions with additional features
@@ -143,38 +144,26 @@ def add_locus_to_gene_features(
         if "locusToGeneFeatures" in self.df.columns:
             self.df = self.df.drop("locusToGeneFeatures")
 
-        # Columns identifying a studyLocus/gene pair
-        prediction_id_columns = ["studyLocusId", "geneId"]
-
-        # L2G matrix columns to build the map:
-        columns_to_map = [
-            column
-            for column in feature_matrix._df.columns
-            if column not in prediction_id_columns
-        ]
-
         # Aggregating all features into a single map column:
         aggregated_features = (
             feature_matrix._df.withColumn(
                 "locusToGeneFeatures",
                 f.create_map(
                     *sum(
-                        [
-                            (f.lit(colname), f.col(colname))
-                            for colname in columns_to_map
-                        ],
+                        ((f.lit(feature), f.col(feature)) for feature in features_list),
                         (),
                     )
                 ),
             )
-            # from the freshly created map, we filter out the null values
             .withColumn(
                 "locusToGeneFeatures",
-                f.expr("map_filter(locusToGeneFeatures, (k, v) -> v is not null)"),
+                f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
             )
-            .drop(*columns_to_map)
+            .drop(*features_list)
         )
         return L2GPrediction(
-            _df=self.df.join(aggregated_features, on=prediction_id_columns, how="left"),
+            _df=self.df.join(
+                aggregated_features, on=["studyLocusId", "geneId"], how="left"
+            ),
             _schema=self.get_schema(),
         )
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
@@ -113,7 +113,7 @@ class StudyLocusQualityCheck(Enum):
     EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set"
     OUT_OF_SAMPLE_LD = "Study locus finemapped without in-sample LD reference"
     ABNORMAL_PIPS = (
-        "Study locus with a sum of PIPs that not in the expected range [0.99,1]"
+        "Study locus with a sum of PIPs that not in the expected range [0.95,1]"
     )
     INVALID_CHROMOSOME = "Chromosome not in 1:22, X, Y, XY or MT"
     TOP_HIT_AND_SUMMARY_STATS = (