chore: trigger release process #503

Merged · 16 commits · Feb 22, 2024 · Changes from all commits
22 changes: 22 additions & 0 deletions .github/workflows/pr_release_trigger.yaml
@@ -0,0 +1,22 @@
name: PR to trigger release

"on":
schedule:
- cron: "15 16 * * 4"

jobs:
pull-request:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: pull-request
uses: diillson/[email protected]
with:
source_branch: "dev"
destination_branch: "main"
pr_title: "chore: trigger release process"
pr_body: ":warning: *This PR requires a MERGE or REBASE COMMIT (Don't squash!)*"
pr_label: "auto-pr"
pr_draft: false
pr_allow_empty: true
github_token: ${{ secrets.GITHUB_TOKEN }}
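
The `"15 16 * * 4"` schedule above fires at 16:15 UTC every Thursday (cron fields are minute, hour, day-of-month, month, day-of-week). A quick way to sanity-check a schedule like this before committing it is the third-party `croniter` package; this is only an illustration, not a dependency added by this PR:

```python
from datetime import datetime, timezone

from croniter import croniter  # third-party helper, assumed installed only for this check

# Same expression as the workflow: 16:15 UTC every Thursday.
schedule = croniter("15 16 * * 4", datetime.now(timezone.utc))
for _ in range(3):
    # Prints the next three times the release-trigger PR would be opened.
    print(schedule.get_next(datetime))
```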
11 changes: 8 additions & 3 deletions .pre-commit-config.yaml
@@ -1,6 +1,9 @@
ci:
autoupdate_commit_msg: "chore: pre-commit autoupdate"
autofix_commit_msg: "chore: pre-commit auto fixes [...]"
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.13
rev: v0.2.2
hooks:
- id: ruff
args:
@@ -13,6 +16,7 @@ repos:
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
exclude: "CHANGELOG.md"
- id: debug-statements
- id: check-merge-conflict
- id: check-case-conflict
@@ -28,7 +32,7 @@ repos:
- id: debug-statements
- id: check-docstring-first
- repo: https://github.com/adrienverge/yamllint.git
rev: v1.33.0
rev: v1.35.1
hooks:
- id: yamllint

@@ -49,6 +53,7 @@ repos:
rev: "v4.0.0-alpha.8"
hooks:
- id: prettier
exclude: "CHANGELOG.md"

- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.11.0
@@ -91,6 +96,6 @@ repos:
- id: beautysh

- repo: https://github.com/jsh9/pydoclint
rev: 0.3.8
rev: 0.4.1
hooks:
- id: pydoclint
1 change: 1 addition & 0 deletions config/step/ot_finngen_finemapping_ingestion.yaml
@@ -3,4 +3,5 @@ defaults:

finngen_finemapping_results_path: ${datasets.finngen_finemapping_results_path}
finngen_finemapping_summaries_path: ${datasets.finngen_finemapping_summaries_path}
finngen_release_prefix: ${datasets.finngen_release_prefix}
finngen_finemapping_out: ${datasets.finngen_finemapping_out}
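
The `${datasets.*}` values are OmegaConf interpolations that Hydra resolves against the shared `datasets` config at compose time, so the new `finngen_release_prefix` key only needs to be defined once in that group. A minimal sketch of the resolution (the literal dict below stands in for the real composed config; the `FINNGEN_R10_` value is taken from the DAG change in this PR):

```python
from omegaconf import OmegaConf

# Stand-in for the composed Hydra config; real values come from the `datasets` group.
cfg = OmegaConf.create(
    {
        "datasets": {"finngen_release_prefix": "FINNGEN_R10_"},
        "step": {"finngen_release_prefix": "${datasets.finngen_release_prefix}"},
    }
)

print(cfg.step.finngen_release_prefix)  # -> FINNGEN_R10_
```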
2 changes: 1 addition & 1 deletion docs/development/contributing.md
@@ -82,5 +82,5 @@ For more details on each of these steps, see the sections below.
### Tests

- Test study fixture in `tests/conftest.py` (example: `mock_study_index_finngen` in that module)
- Test sample data in `tests/data_samples` (example: `tests/data_samples/finngen_studies_sample.json`)
- Test sample data in `tests/data_samples` (example: `tests/gentropy/data_samples/finngen_studies_sample.json`)
- Test definition in `tests/` (example: `tests/dataset/test_study_index.py` → `test_study_index_finngen_creation`)
@@ -1,7 +1,9 @@
---
title: SuSiE-inf - Fine-mapping with infinitesimal effects v1.1
title: SuSiE-inf
---

# SuSiE-inf - Fine-mapping with infinitesimal effects v1.1

This is an implementation of the SuSiE-inf method found here:
https://github.com/FinucaneLab/fine-mapping-inf
https://www.nature.com/articles/s41588-023-01597-3
6 changes: 3 additions & 3 deletions docs/src_snippets/howto/python_api/b_create_dataset.py
@@ -17,7 +17,7 @@ def create_from_parquet(session: Session) -> SummaryStatistics:

# --8<-- [end:create_from_parquet_import]

path = "tests/data_samples/sumstats_sample/GCST005523_chr18.parquet"
path = "tests/gentropy/data_samples/sumstats_sample/GCST005523_chr18.parquet"
# --8<-- [start:create_from_parquet]
summary_stats = SummaryStatistics.from_parquet(session, path)
# --8<-- [end:create_from_parquet]
@@ -31,7 +31,7 @@ def create_from_source(session: Session) -> SummaryStatistics:
from gentropy.datasource.finngen.summary_stats import FinnGenSummaryStats

# --8<-- [end:create_from_source_import]
path = "tests/data_samples/finngen_R9_AB1_ACTINOMYCOSIS.gz"
path = "tests/gentropy/data_samples/finngen_R9_AB1_ACTINOMYCOSIS.gz"
# --8<-- [start:create_from_source]
summary_stats = FinnGenSummaryStats.from_source(session.spark, path)
# --8<-- [end:create_from_source]
@@ -46,7 +46,7 @@ def create_from_pandas() -> SummaryStatistics:

# --8<-- [end:create_from_pandas_import]

path = "tests/data_samples/sumstats_sample/GCST005523_chr18.parquet"
path = "tests/gentropy/data_samples/sumstats_sample/GCST005523_chr18.parquet"
custom_summary_stats_pandas_df = ps.read_parquet(path)
# --8<-- [start:create_from_pandas]

140 changes: 115 additions & 25 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions pyproject.toml
@@ -62,7 +62,7 @@ pytest-cov = "^4.1.0"
pytest-sugar = ">=0.9.5,<1.1.0"
dbldatagen = "^0.3.1"
pyparsing = "^3.1.1"
pytest = "^7.4.4"
pytest = ">=7.4.4,<9.0.0"
pytest-xdist = "^3.5.0"


@@ -72,10 +72,10 @@ ipykernel = "^6.28.0"
google-cloud-dataproc = "^5.8.0"
apache-airflow = "^2.8.0"
apache-airflow-providers-google = "^10.13.1"
pydoclint = "^0.3.8"
pydoclint = ">=0.3.8,<0.5.0"
prettier = "^0.0.7"
deptry = "^0.12.0"
python-semantic-release = "^8.7.0"
python-semantic-release = ">=8.7,<10.0"
yamllint = "^1.33.0"

[tool.semantic_release]
@@ -127,6 +127,7 @@ exclude = ["dist"]
[tool.pytest.ini_options]
addopts = "-n auto --doctest-modules --cov=src/ --cov-report=xml"
pythonpath = [".", "./src/airflow/dags"]
testpaths = ["tests/gentropy", "src/gentropy/"]

# Semi-strict mode for mypy
[tool.mypy]
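
Because `addopts` already passes `--doctest-modules`, adding `src/gentropy/` to `testpaths` means docstring examples inside the package are now collected and run by pytest alongside the unit tests in `tests/gentropy`. A hypothetical docstring of the kind that would now execute (the function below is illustrative, not part of the codebase):

```python
def prefix_study_id(study: str, prefix: str = "FINNGEN_R10_") -> str:
    """Prefix a FinnGen study identifier with the release prefix.

    Examples:
        >>> prefix_study_id("AB1_ACTINOMYCOSIS")
        'FINNGEN_R10_AB1_ACTINOMYCOSIS'
    """
    return f"{prefix}{study}"
```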
13 changes: 13 additions & 0 deletions src/airflow/dags/finngen_preprocess.py
@@ -21,6 +21,13 @@
LD_CLUMPED = f"{FINNGEN_BUCKET}/study_locus_datasets/finngen_ld_clumped"
PICSED_CREDIBLE_SET = f"{FINNGEN_BUCKET}/credible_set_datasets/finngen_pics"

FINNGEN_FINEMAPPING = (
"gs://genetics_etl_python_playground/input/Finngen_susie_finemapping_r10/full"
)
FINNGEN_FM_SUMMARIES = "gs://genetics_etl_python_playground/input/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv"
FINNGEN_PREFIX = "FINNGEN_R10_"
FINNGEN_FM_OUT = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX/finngen_susie_processed"

with DAG(
dag_id=Path(__file__).stem,
description="Open Targets Genetics — Finngen preprocess",
@@ -31,6 +38,12 @@
cluster_name=CLUSTER_NAME,
step_id="ot_finngen_finemapping_ingestion",
task_id="finngen_finemapping_ingestion",
other_args=[
f"step.finngen_finemapping_out={FINNGEN_FM_OUT}",
f"step.finngen_release_prefix={FINNGEN_PREFIX}",
f"step.finngen_finemapping_results_path={FINNGEN_FINEMAPPING}",
f"step.finngen_finemapping_summaries_path={FINNGEN_FM_SUMMARIES}",
],
# This allows attempting to run the task when the above step fails due to failifexists
trigger_rule=TriggerRule.ALL_DONE,
)
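
Each entry in `other_args` is a Hydra-style `step.<field>=<value>` override, one per field of the new `FinngenFinemappingConfig`. A small sketch of an equivalent way to build that list from the module-level constants defined above, shown only to make the mapping explicit (this is not how the DAG is written in this PR):

```python
# Illustration: the same overrides expressed as a dict and rendered into
# the `step.<field>=<value>` strings passed to the step.
overrides = {
    "finngen_finemapping_out": FINNGEN_FM_OUT,
    "finngen_release_prefix": FINNGEN_PREFIX,
    "finngen_finemapping_results_path": FINNGEN_FINEMAPPING,
    "finngen_finemapping_summaries_path": FINNGEN_FM_SUMMARIES,
}
other_args = [f"step.{key}={value}" for key, value in overrides.items()]
```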
20 changes: 20 additions & 0 deletions src/gentropy/config.py
@@ -133,6 +133,19 @@ class FinngenSumstatPreprocessConfig(StepConfig):
_target_: str = "gentropy.finngen_sumstat_preprocess.FinnGenSumstatPreprocessStep"


@dataclass
class FinngenFinemappingConfig(StepConfig):
"""FinnGen fine mapping ingestion step configuration."""

finngen_finemapping_results_path: str = MISSING
finngen_finemapping_summaries_path: str = MISSING
finngen_release_prefix: str = MISSING
finngen_finemapping_out: str = MISSING
_target_: str = (
"gentropy.finngen_finemapping_ingestion.FinnGenFinemappingIngestionStep"
)


@dataclass
class LDIndexConfig(StepConfig):
"""LD index step configuration."""
@@ -353,6 +366,13 @@ def register_config() -> None:
name="finngen_sumstat_preprocess",
node=FinngenSumstatPreprocessConfig,
)

cs.store(
group="step",
name="finngen_finemapping_ingestion",
node=FinngenFinemappingConfig,
)

cs.store(group="step", name="pics", node=PICSConfig)
cs.store(group="step", name="variant_annotation", node=VariantAnnotationConfig)
cs.store(group="step", name="variant_index", node=VariantIndexConfig)
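
All four new fields default to `MISSING`, so the step cannot run unless each one is supplied as an override (exactly what the DAG's `other_args` provide). A stripped-down sketch of that behavior using OmegaConf directly (the dataclass below is a stand-in, not the real `FinngenFinemappingConfig`):

```python
from dataclasses import dataclass

from omegaconf import MISSING, OmegaConf
from omegaconf.errors import MissingMandatoryValue


@dataclass
class DemoStepConfig:
    finngen_release_prefix: str = MISSING


cfg = OmegaConf.structured(DemoStepConfig)
try:
    _ = cfg.finngen_release_prefix  # no override supplied
except MissingMandatoryValue:
    print("Provide step.finngen_release_prefix=FINNGEN_R10_ (or similar) at launch time.")
```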
81 changes: 56 additions & 25 deletions src/gentropy/datasource/finngen/finemapping.py
@@ -8,7 +8,7 @@
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StringType, StructField, StructType
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

from gentropy.common.spark_helpers import get_top_ranked_in_window
from gentropy.common.utils import parse_pvalue
@@ -29,7 +29,6 @@ class FinnGenFinemapping:
Finemapping method is populated as a constant ("SuSIE").
"""

finngen_release_prefix: str = "FINNGEN_R10"
raw_schema: t.StructType = StructType(
[
StructField("trait", StringType(), True),
@@ -48,26 +47,56 @@ class FinnGenFinemapping:
StructField("sd", StringType(), True),
StructField("prob", StringType(), True),
StructField("cs", StringType(), True),
StructField("alpha1", StringType(), True),
StructField("alpha2", StringType(), True),
StructField("alpha3", StringType(), True),
StructField("alpha4", StringType(), True),
StructField("alpha5", StringType(), True),
StructField("alpha6", StringType(), True),
StructField("alpha7", StringType(), True),
StructField("alpha8", StringType(), True),
StructField("alpha9", StringType(), True),
StructField("alpha10", StringType(), True),
StructField("lbf_variable1", StringType(), True),
StructField("lbf_variable2", StringType(), True),
StructField("lbf_variable3", StringType(), True),
StructField("lbf_variable4", StringType(), True),
StructField("lbf_variable5", StringType(), True),
StructField("lbf_variable6", StringType(), True),
StructField("lbf_variable7", StringType(), True),
StructField("lbf_variable8", StringType(), True),
StructField("lbf_variable9", StringType(), True),
StructField("lbf_variable10", StringType(), True),
StructField("cs_specific_prob", DoubleType(), True),
StructField("low_purity", StringType(), True),
StructField("lead_r2", StringType(), True),
StructField("mean_99", StringType(), True),
StructField("sd_99", StringType(), True),
StructField("prob_99", StringType(), True),
StructField("cs_99", StringType(), True),
StructField("cs_specific_prob_99", StringType(), True),
StructField("low_purity_99", StringType(), True),
StructField("lead_r2_99", StringType(), True),
StructField("alpha1", DoubleType(), True),
StructField("alpha2", DoubleType(), True),
StructField("alpha3", DoubleType(), True),
StructField("alpha4", DoubleType(), True),
StructField("alpha5", DoubleType(), True),
StructField("alpha6", DoubleType(), True),
StructField("alpha7", DoubleType(), True),
StructField("alpha8", DoubleType(), True),
StructField("alpha9", DoubleType(), True),
StructField("alpha10", DoubleType(), True),
StructField("mean1", StringType(), True),
StructField("mean2", StringType(), True),
StructField("mean3", StringType(), True),
StructField("mean4", StringType(), True),
StructField("mean5", StringType(), True),
StructField("mean6", StringType(), True),
StructField("mean7", StringType(), True),
StructField("mean8", StringType(), True),
StructField("mean9", StringType(), True),
StructField("mean10", StringType(), True),
StructField("sd1", StringType(), True),
StructField("sd2", StringType(), True),
StructField("sd3", StringType(), True),
StructField("sd4", StringType(), True),
StructField("sd5", StringType(), True),
StructField("sd6", StringType(), True),
StructField("sd7", StringType(), True),
StructField("sd8", StringType(), True),
StructField("sd9", StringType(), True),
StructField("sd10", StringType(), True),
StructField("lbf_variable1", DoubleType(), True),
StructField("lbf_variable2", DoubleType(), True),
StructField("lbf_variable3", DoubleType(), True),
StructField("lbf_variable4", DoubleType(), True),
StructField("lbf_variable5", DoubleType(), True),
StructField("lbf_variable6", DoubleType(), True),
StructField("lbf_variable7", DoubleType(), True),
StructField("lbf_variable8", DoubleType(), True),
StructField("lbf_variable9", DoubleType(), True),
StructField("lbf_variable10", DoubleType(), True),
]
)

@@ -76,7 +105,7 @@ class FinnGenFinemapping:
StructField("trait", StringType(), True),
StructField("region", StringType(), True),
StructField("cs", StringType(), True),
StructField("cs_log10bf", StringType(), True),
StructField("cs_log10bf", DoubleType(), True),
]
)

@@ -86,6 +115,7 @@ def from_finngen_susie_finemapping(
spark: SparkSession,
finngen_finemapping_df: (str | list[str]),
finngen_finemapping_summaries: (str | list[str]),
finngen_release_prefix: str,
credset_lbf_threshold: float = 0.8685889638065036,
) -> StudyLocus:
"""Process the SuSIE finemapping output for FinnGen studies.
@@ -94,6 +124,7 @@
spark (SparkSession): Spark session object.
finngen_finemapping_df (str | list[str]): SuSIE finemapping output filename(s).
finngen_finemapping_summaries (str | list[str]): filename of SuSIE finemapping summaries.
finngen_release_prefix (str): FinnGen study prefix.
credset_lbf_threshold (float, optional): Filter out credible sets whose log10 Bayes factor is below this value. Defaults to 0.8685889638065036 == np.log10(np.exp(2)), the threshold used in the publication.

Returns:
@@ -110,7 +141,7 @@ def from_finngen_susie_finemapping(
.filter(f.col("cs").cast(t.IntegerType()) > 0)
.select(
# Add study identifier.
f.concat(f.lit(cls.finngen_release_prefix), f.col("trait"))
f.concat(f.lit(finngen_release_prefix), f.col("trait"))
.cast(t.StringType())
.alias("studyId"),
f.col("region"),
@@ -209,7 +240,7 @@ def from_finngen_susie_finemapping(
| (f.col("credibleSetIndex") == 1)
)
.withColumn(
"studyId", f.concat(f.lit(cls.finngen_release_prefix), f.col("trait"))
"studyId", f.concat(f.lit(finngen_release_prefix), f.col("trait"))
)
)

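
With the release prefix now an explicit argument instead of a class constant, callers pass it alongside the input paths. A usage sketch of the classmethod changed above (the paths are placeholders and the Spark session is whatever the caller already has; only the method signature comes from this diff):

```python
from pyspark.sql import SparkSession

from gentropy.datasource.finngen.finemapping import FinnGenFinemapping

spark = SparkSession.builder.getOrCreate()

credible_sets = FinnGenFinemapping.from_finngen_susie_finemapping(
    spark=spark,
    finngen_finemapping_df="gs://<bucket>/finngen_finemapping/full/*",  # placeholder path
    finngen_finemapping_summaries="gs://<bucket>/finngen_finemapping/susie_credset_summary.tsv",  # placeholder path
    finngen_release_prefix="FINNGEN_R10_",
)
# `credible_sets` is a StudyLocus dataset, ready to be written out as parquet.
```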
5 changes: 4 additions & 1 deletion src/gentropy/finngen_finemapping_ingestion.py
@@ -18,6 +18,7 @@ def __init__(
session: Session,
finngen_finemapping_results_path: str,
finngen_finemapping_summaries_path: str,
finngen_release_prefix: str,
finngen_finemapping_out: str,
) -> None:
"""Run FinnGen finemapping ingestion step.
@@ -26,14 +27,16 @@
session (Session): Session object.
finngen_finemapping_results_path (str): Path to the FinnGen SuSIE finemapping results.
finngen_finemapping_summaries_path (str): FinnGen SuSIE summaries used to filter credible sets (LBF > 2).
finngen_release_prefix (str): Release prefix for FinnGen.
finngen_finemapping_out (str): Output path for the finemapping results in StudyLocus format.
"""
# Read finemapping outputs from the URL.
# Read finemapping outputs from the input paths.

finngen_finemapping_df = FinnGenFinemapping.from_finngen_susie_finemapping(
spark=session.spark,
finngen_finemapping_df=finngen_finemapping_results_path,
finngen_finemapping_summaries=finngen_finemapping_summaries_path,
finngen_release_prefix=finngen_release_prefix,
)

# Write the output.
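
The step simply threads the new `finngen_release_prefix` argument through to the datasource and writes the result. A hedged end-to-end sketch, assuming `Session` is importable from `gentropy.common.session` and can be constructed with defaults (constructor details are not shown in this PR); all paths are placeholders:

```python
from gentropy.common.session import Session  # assumed import path
from gentropy.finngen_finemapping_ingestion import FinnGenFinemappingIngestionStep

session = Session()  # assumed to start a local Spark session with default settings

FinnGenFinemappingIngestionStep(
    session=session,
    finngen_finemapping_results_path="gs://<bucket>/finngen_finemapping/full/*",
    finngen_finemapping_summaries_path="gs://<bucket>/finngen_finemapping/susie_credset_summary.tsv",
    finngen_release_prefix="FINNGEN_R10_",
    finngen_finemapping_out="gs://<bucket>/output/finngen_susie_processed",
)
```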
4 changes: 1 addition & 3 deletions tests/__init__.py
@@ -1,3 +1 @@
"""Unit tests."""

from __future__ import annotations
"""Gentropy tests package."""
3 changes: 3 additions & 0 deletions tests/gentropy/__init__.py
@@ -0,0 +1,3 @@
"""Unit tests."""

from __future__ import annotations
File renamed without changes.