Skip to content

Commit

Permalink
Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…
Browse files Browse the repository at this point in the history
…1_l2g_intervals
xyg123 committed Dec 10, 2024
2 parents 55f947f + 79f6fcc commit 1de5fcf
Showing 28 changed files with 502 additions and 274 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ ci:
skip: [poetry-lock]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.3
rev: v0.7.4
hooks:
- id: ruff
args:
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
PROJECT_ID ?= open-targets-genetics-dev
REGION ?= europe-west1
APP_NAME ?= $$(cat pyproject.toml | grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g')
REF ?= $$(git rev-parse --abbrev-ref HEAD)
PACKAGE_VERSION ?= $$(poetry version --short)
# NOTE: `git rev-parse --abbrev-ref HEAD` returns the literal string "HEAD" when a tag
# is checked out (detached HEAD); this lets us distinguish a tag from a branch name
ifeq ($(shell git rev-parse --abbrev-ref HEAD),HEAD)
REF := $(shell git describe --exact-match --tags)
else
REF := $(shell git rev-parse --abbrev-ref HEAD)
endif

CLEAN_PACKAGE_VERSION := $(shell echo "$(PACKAGE_VERSION)" | tr -cd '[:alnum:]')
BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${APP_NAME}/${REF}

54 changes: 27 additions & 27 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -44,7 +44,7 @@ pep8-naming = "^0.14.1"
interrogate = "^1.7.0"
isort = "^5.13.2"
darglint = "^1.8.1"
ruff = "^0.7.0"
ruff = "^0.8.1"

[tool.poetry.group.docs.dependencies]
mkdocs = "^1.5.3"
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/l2g_gold_standard.json
Original file line number Diff line number Diff line change
@@ -25,6 +25,12 @@
"nullable": false,
"metadata": {}
},
{
"name": "traitFromSourceMappedId",
"type": "string",
"nullable": true,
"metadata": {}
},
{
"name": "goldStandardSet",
"type": "string",
12 changes: 12 additions & 0 deletions src/gentropy/assets/schemas/vep_json_output.json
Original file line number Diff line number Diff line change
@@ -20,6 +20,12 @@
"containsNull": true,
"elementType": {
"fields": [
{
"metadata": {},
"name": "conservation",
"nullable": true,
"type": "double"
},
{
"metadata": {},
"name": "hgvsg",
@@ -294,6 +300,12 @@
"containsNull": true,
"elementType": {
"fields": [
{
"metadata": {},
"name": "conservation",
"nullable": true,
"type": "double"
},
{
"metadata": {},
"name": "alphamissense",
13 changes: 10 additions & 3 deletions src/gentropy/biosample_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Step to generate biosample index dataset."""

from __future__ import annotations

from gentropy.common.session import Session
@@ -28,10 +29,16 @@ def __init__(
efo_input_path (str): Input efo dataset path.
biosample_index_path (str): Output biosample index dataset path.
"""
cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark)
cell_ontology_index = extract_ontology_from_json(
cell_ontology_input_path, session.spark
)
uberon_index = extract_ontology_from_json(uberon_input_path, session.spark)
efo_index = extract_ontology_from_json(efo_input_path, session.spark).retain_rows_with_ancestor_id(["CL_0000000"])
efo_index = extract_ontology_from_json(
efo_input_path, session.spark
).retain_rows_with_ancestor_id(["CL_0000000"])

biosample_index = cell_ontology_index.merge_indices([uberon_index, efo_index])

biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_path)
biosample_index.df.coalesce(session.output_partitions).write.mode(
session.write_mode
).parquet(biosample_index_path)
6 changes: 3 additions & 3 deletions src/gentropy/colocalisation.py
Original file line number Diff line number Diff line change
@@ -70,9 +70,9 @@ def __init__(
coloc = partial(coloc, **colocalisation_method_params)
colocalisation_results = coloc(overlaps)
# Load
colocalisation_results.df.write.mode(session.write_mode).parquet(
f"{coloc_path}/{colocalisation_method.lower()}"
)
colocalisation_results.df.coalesce(session.output_partitions).write.mode(
session.write_mode
).parquet(f"{coloc_path}/{colocalisation_method.lower()}")

@classmethod
def _get_colocalisation_class(
3 changes: 3 additions & 0 deletions src/gentropy/common/session.py
Original file line number Diff line number Diff line change
@@ -24,6 +24,7 @@ def __init__( # noqa: D107
hail_home: str | None = None,
start_hail: bool = False,
extended_spark_conf: dict[str, str] | None = None,
output_partitions: int = 200,
) -> None:
"""Initialises spark session and logger.
@@ -34,6 +35,7 @@ def __init__( # noqa: D107
hail_home (str | None): Path to Hail installation. Defaults to None.
start_hail (bool): Whether to start Hail. Defaults to False.
extended_spark_conf (dict[str, str] | None): Extended Spark configuration. Defaults to None.
output_partitions (int): Number of partitions for output datasets. Defaults to 200.
"""
merged_conf = self._create_merged_config(
start_hail, hail_home, extended_spark_conf
@@ -53,6 +55,7 @@ def __init__( # noqa: D107
self.start_hail = start_hail
if start_hail:
hl.init(sc=self.spark.sparkContext, log="/dev/null")
self.output_partitions = output_partitions

def _default_config(self: Session) -> SparkConf:
"""Default spark configuration.
20 changes: 11 additions & 9 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ class SessionConfig:
spark_uri: str = "local[*]"
hail_home: str = os.path.dirname(hail_location)
extended_spark_conf: dict[str, str] | None = field(default_factory=dict[str, str])
output_partitions: int = 200
_target_: str = "gentropy.common.session.Session"


@@ -263,7 +264,6 @@ class LocusToGeneConfig(StepConfig):
"geneCount500kb",
"proteinGeneCount500kb",
"credibleSetConfidence",
# "isProteinCoding",
# intervals
"pchicMean",
"pchicMeanNeighbourhood",
@@ -276,19 +276,19 @@ class LocusToGeneConfig(StepConfig):
hyperparameters: dict[str, Any] = field(
default_factory=lambda: {
"n_estimators": 100,
"max_depth": 5,
"loss": "log_loss",
"max_depth": 10,
"ccp_alpha": 0,
"learning_rate": 0.1,
"min_samples_leaf": 5,
"min_samples_split": 5,
"subsample": 1,
}
)
wandb_run_name: str | None = None
hf_hub_repo_id: str | None = "opentargets/locus_to_gene"
hf_model_commit_message: str | None = "chore: update model"
download_from_hub: bool = True
# interval_sources: dict[str, str] | None = {
# "javierre": "gs://genetics_etl_python_playground/static_assets/javierre_2016_preprocessed",
# "thurman": "gs://genetics_etl_python_playground/static_assets/thurman_2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz",
# "andersson": "gs://genetics_etl_python_playground/static_assets/andersson2014/enhancer_tss_associations.bed",
# }
cross_validate: bool = True
_target_: str = "gentropy.l2g.LocusToGeneStep"


@@ -399,7 +399,9 @@ class GnomadVariantConfig(StepConfig):
}
)
variant_annotation_path: str = MISSING
gnomad_genomes_path: str = "gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/"
gnomad_genomes_path: str = (
"gs://gcp-public-data--gnomad/release/4.1/ht/joint/gnomad.joint.v4.1.sites.ht/"
)
gnomad_variant_populations: list[str] = field(
default_factory=lambda: [
"afr", # African-American
2 changes: 2 additions & 0 deletions src/gentropy/dataset/l2g_feature_matrix.py
Original file line number Diff line number Diff line change
@@ -39,6 +39,8 @@ def __init__(
self.fixed_cols = ["studyLocusId", "geneId"]
if self.with_gold_standard:
self.fixed_cols.append("goldStandardSet")
if "traitFromSourceMappedId" in _df.columns:
self.fixed_cols.append("traitFromSourceMappedId")

self.features_list = features_list or [
col for col in _df.columns if col not in self.fixed_cols
29 changes: 9 additions & 20 deletions src/gentropy/dataset/l2g_prediction.py
Original file line number Diff line number Diff line change
@@ -129,12 +129,13 @@ def to_disease_target_evidence(
)

def add_locus_to_gene_features(
self: L2GPrediction, feature_matrix: L2GFeatureMatrix
self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
) -> L2GPrediction:
"""Add features to the L2G predictions.
"""Add features used to extract the L2G predictions.
Args:
feature_matrix (L2GFeatureMatrix): Feature matrix dataset
features_list (list[str]): List of features used in the model
Returns:
L2GPrediction: L2G predictions with additional features
@@ -143,38 +144,26 @@ def add_locus_to_gene_features(
if "locusToGeneFeatures" in self.df.columns:
self.df = self.df.drop("locusToGeneFeatures")

# Columns identifying a studyLocus/gene pair
prediction_id_columns = ["studyLocusId", "geneId"]

# L2G matrix columns to build the map:
columns_to_map = [
column
for column in feature_matrix._df.columns
if column not in prediction_id_columns
]

# Aggregating all features into a single map column:
aggregated_features = (
feature_matrix._df.withColumn(
"locusToGeneFeatures",
f.create_map(
*sum(
[
(f.lit(colname), f.col(colname))
for colname in columns_to_map
],
((f.lit(feature), f.col(feature)) for feature in features_list),
(),
)
),
)
# from the freshly created map, we filter out the null values
.withColumn(
"locusToGeneFeatures",
f.expr("map_filter(locusToGeneFeatures, (k, v) -> v is not null)"),
f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
)
.drop(*columns_to_map)
.drop(*features_list)
)
return L2GPrediction(
_df=self.df.join(aggregated_features, on=prediction_id_columns, how="left"),
_df=self.df.join(
aggregated_features, on=["studyLocusId", "geneId"], how="left"
),
_schema=self.get_schema(),
)
2 changes: 1 addition & 1 deletion src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
@@ -113,7 +113,7 @@ class StudyLocusQualityCheck(Enum):
EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set"
OUT_OF_SAMPLE_LD = "Study locus finemapped without in-sample LD reference"
ABNORMAL_PIPS = (
"Study locus with a sum of PIPs that not in the expected range [0.99,1]"
"Study locus with a sum of PIPs that not in the expected range [0.95,1]"
)
INVALID_CHROMOSOME = "Chromosome not in 1:22, X, Y, XY or MT"
TOP_HIT_AND_SUMMARY_STATS = (
Loading

0 comments on commit 1de5fcf

Please sign in to comment.