-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: implement UKB PPP (EUR) ingestion & harmonisation (#652)
* feat: implement UKB PPP (EUR) ingestion & harmonisation * fix: correct module name for docs * fix: definitely correct module name for docs * test: update output of neglog_pvalue_to_mantissa_and_exponent * fix: test syntax with <BLANKLINE> * Update src/gentropy/datasource/ukb_ppp_eur/summary_stats.py Co-authored-by: Szymon Szyszkowski <[email protected]> * fix: code review updates for docs and version * fix: syntax for concat_ws * style: list harmonisation steps in the docstring * style: rename freq to MAF * style: use concat_ws * style: use two distinct parameters for study index and summary stats output paths --------- Co-authored-by: Szymon Szyszkowski <[email protected]>
- Loading branch information
1 parent
0d9160f
commit 8ff4bfc
Showing
15 changed files
with
477 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
defaults: | ||
- ukb_ppp_eur_sumstat_preprocess | ||
|
||
raw_study_index_path: ??? | ||
raw_summary_stats_path: ??? | ||
variant_annotation_path: ??? | ||
tmp_variant_annotation_path: ??? | ||
study_index_output_path: ??? | ||
summary_stats_output_path: ??? | ||
|
||
session: | ||
extended_spark_conf: | ||
"spark.sql.shuffle.partitions": "3200" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
--- | ||
title: UKB-PPP (EUR) | ||
--- | ||
|
||
The UKB-PPP is a collaboration between the UK Biobank (UKB) and thirteen biopharmaceutical companies characterising the plasma proteomic profiles of 54,219 UKB participants. | ||
|
||
The original data is available at https://www.synapse.org/#!Synapse:syn51364943/. The associated paper is https://www.nature.com/articles/s41586-023-06592-6. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
title: Study Index | ||
--- | ||
|
||
::: gentropy.datasource.ukb_ppp_eur.study_index.UkbPppEurStudyIndex |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
title: Summary Statistics | ||
--- | ||
|
||
::: gentropy.datasource.ukb_ppp_eur.summary_stats.UkbPppEurSummaryStats |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
title: ukb_ppp_eur_sumstat_preprocess | ||
--- | ||
|
||
::: gentropy.ukb_ppp_eur_sumstat_preprocess.UkbPppEurStep |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
"""Airflow DAG to ingest and harmonise UKB PPP (EUR) data.""" | ||
|
||
from __future__ import annotations | ||
|
||
from pathlib import Path | ||
|
||
import common_airflow as common | ||
from airflow.models.dag import DAG | ||
|
||
# Name passed both to the DAG generator and to the submitted step.
CLUSTER_NAME = "otg-ukb-ppp-eur"

# Inputs read by the preprocessing step.
UKB_PPP_EUR_STUDY_INDEX = "gs://gentropy-tmp/batch/output/ukb_ppp_eur/study_index.tsv"
UKB_PPP_EUR_SUMMARY_STATS = "gs://gentropy-tmp/batch/output/ukb_ppp_eur/summary_stats.parquet"
# NOTE(review): `XX.XX` looks like a release-version placeholder — confirm
# the intended path before running.
VARIANT_ANNOTATION = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX/variant_annotation"

# Outputs written by the preprocessing step.
TMP_VARIANT_ANNOTATION = "gs://gentropy-tmp/variant_annotation"
UKB_PPP_EUR_OUTPUT_STUDY_INDEX = "gs://ukb_ppp_eur_data/study_index"
UKB_PPP_EUR_OUTPUT_SUMMARY_STATS = "gs://ukb_ppp_eur_data/summary_stats"

# The DAG id is derived from this file's name (without extension).
with DAG(
    dag_id=Path(__file__).stem,
    description="Open Targets Genetics — Ingest UKB PPP (EUR)",
    default_args=common.shared_dag_args,
    **common.shared_dag_kwargs,
):
    # A single step is submitted; each entry of `other_args` is a
    # `step.<parameter>=<value>` override built from the constants above.
    dag = common.generate_dag(
        cluster_name=CLUSTER_NAME,
        tasks=[
            common.submit_step(
                cluster_name=CLUSTER_NAME,
                step_id="ot_ukb_ppp_eur_sumstat_preprocess",
                other_args=[
                    f"step.raw_study_index_path={UKB_PPP_EUR_STUDY_INDEX}",
                    f"step.raw_summary_stats_path={UKB_PPP_EUR_SUMMARY_STATS}",
                    f"step.variant_annotation_path={VARIANT_ANNOTATION}",
                    f"step.tmp_variant_annotation_path={TMP_VARIANT_ANNOTATION}",
                    f"step.study_index_output_path={UKB_PPP_EUR_OUTPUT_STUDY_INDEX}",
                    f"step.summary_stats_output_path={UKB_PPP_EUR_OUTPUT_SUMMARY_STATS}",
                ],
            ),
        ],
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""UKB PPP (EUR) data source.""" | ||
|
||
from __future__ import annotations |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
"""Study Index for UKB PPP (EUR) data source.""" | ||
from __future__ import annotations | ||
|
||
import pyspark.sql.functions as f | ||
from pyspark.sql import SparkSession | ||
|
||
from gentropy.dataset.study_index import StudyIndex | ||
|
||
|
||
class UkbPppEurStudyIndex(StudyIndex):
    """Study index dataset from UKB PPP (EUR)."""

    @classmethod
    def from_source(
        cls: type[UkbPppEurStudyIndex],
        spark: SparkSession,
        raw_study_index_path: str,
        raw_summary_stats_path: str,
    ) -> StudyIndex:
        """Ingest study level metadata from UKB PPP (EUR).

        The raw study index (tab-separated, with header) is combined with a
        per-study sample count taken from the raw summary statistics. The two
        are inner-joined on `studyId`, so a study with no matching sample
        count row is dropped from the result.

        Args:
            spark (SparkSession): Spark session object.
            raw_study_index_path (str): Raw study index path.
            raw_summary_stats_path (str): Raw summary stats path.

        Returns:
            StudyIndex: Parsed and annotated UKB PPP (EUR) study table.
        """
        # In order to populate the nSamples column, we need to peek inside the
        # summary stats dataframe. Only chromosome 22 rows are considered and
        # the first `N` value per study is taken.
        # NOTE(review): presumably `N` is constant within a study and the
        # chromosome filter just limits the data read — confirm.
        num_of_samples = (
            spark.read.parquet(raw_summary_stats_path)
            .filter(f.col("chromosome") == "22")
            .groupBy("studyId")
            .agg(f.first("N").cast("integer").alias("nSamples"))
        )

        # Now we can read the raw study index and complete the processing.
        study_index_df = (
            spark.read.csv(raw_study_index_path, sep="\t", header=True)
            .select(
                f.lit("pqtl").alias("studyType"),
                f.lit("UKB_PPP_EUR").alias("projectId"),
                f.col("_gentropy_study_id").alias("studyId"),
                f.col("UKBPPP_ProteinID").alias("traitFromSource"),
                f.lit("UBERON_0001969").alias("tissueFromSourceId"),
                f.col("ensembl_id").alias("geneId"),
                f.lit(True).alias("hasSumstats"),
                f.col("_gentropy_summary_stats_link").alias("summarystatsLocation"),
            )
            .join(num_of_samples, "studyId", "inner")
        )

        # Add population structure: a single "European" discovery sample whose
        # size is nSamples (already an integer, so no further cast is needed).
        study_index_df = study_index_df.withColumn(
            "discoverySamples",
            f.array(
                f.struct(
                    f.col("nSamples").alias("sampleSize"),
                    f.lit("European").alias("ancestry"),
                )
            ),
        ).withColumn(
            # Derived from discoverySamples by a helper defined outside this
            # class body (reached through `cls`).
            "ldPopulationStructure",
            cls.aggregate_and_map_ancestries(f.col("discoverySamples")),
        )

        return StudyIndex(
            _df=study_index_df,
            _schema=StudyIndex.get_schema(),
        )
Oops, something went wrong.