From 82e99bfe6238937c5ba71ead41fcd3c28457001c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Dec 2023 10:08:55 +0000 Subject: [PATCH 01/21] build(deps): bump pyspark from 3.3.3 to 3.3.4 (#358) Bumps [pyspark](https://github.com/apache/spark) from 3.3.3 to 3.3.4. - [Commits](https://github.com/apache/spark/compare/v3.3.3...v3.3.4) --- updated-dependencies: - dependency-name: pyspark dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4ff44ba9a..21b0e56bc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6419,12 +6419,12 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pyspark" -version = "3.3.3" +version = "3.3.4" description = "Apache Spark Python API" optional = false python-versions = ">=3.7" files = [ - {file = "pyspark-3.3.3.tar.gz", hash = "sha256:384d2ad7090cd1db5b2d2ac497bda409d86ab3a27272833e1a27efadf45e4d2f"}, + {file = "pyspark-3.3.4.tar.gz", hash = "sha256:1f866be47130a522355240949ed50d9812a8f327bd7619f043ffe07fbcf7f7b6"}, ] [package.dependencies] @@ -8462,4 +8462,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "47cea83286b0dac00a818ebc5e5ce81fbc5605e768045bdc3b4805f32fda2884" +content-hash = "61c92b5469c17469a2109c72da3789ef57dee5949201c36c0270aec674c8e079" diff --git a/pyproject.toml b/pyproject.toml index 943b559d6..507ed180d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ otg = "otg.cli:main" [tool.poetry.dependencies] python = "3.10.8" -pyspark = "3.3.3" +pyspark = "3.3.4" scipy = "^1.11.4" hydra-core = "^1.3.2" pyliftover = "^0.4" From d95142943b35c396d13f1644a582ce004ae19dcf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Dec 2023 11:07:33 +0000 Subject: [PATCH 02/21] chore(deps): bump python-semantic-release/python-semantic-release (#359) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [python-semantic-release/python-semantic-release](https://github.com/python-semantic-release/python-semantic-release) from 8.3.0 to 8.5.1. - [Release notes](https://github.com/python-semantic-release/python-semantic-release/releases) - [Changelog](https://github.com/python-semantic-release/python-semantic-release/blob/master/CHANGELOG.md) - [Commits](https://github.com/python-semantic-release/python-semantic-release/compare/v8.3.0...v8.5.1) --- updated-dependencies: - dependency-name: python-semantic-release/python-semantic-release dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Irene López <45119610+ireneisdoomed@users.noreply.github.com> --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index af4dc5f38..8f5cf3c3e 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -58,7 +58,7 @@ jobs: - name: Python Semantic Release id: release - uses: python-semantic-release/python-semantic-release@v8.3.0 + uses: python-semantic-release/python-semantic-release@v8.5.1 with: github_token: ${{ secrets.GITHUB_TOKEN }} From 33652b2e37656ec7403ac71a8bfbdf3dab7ac7c9 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Mon, 18 Dec 2023 14:11:41 +0000 Subject: [PATCH 03/21] ci: new changelog and release notes templates (#357) Templates for CHANGELOG and release notes. To be fully tested on the next release. --- templates/.release_notes.md.j2 | 11 +++++++++++ templates/CHANGELOG.md.j2 | 27 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 templates/.release_notes.md.j2 create mode 100644 templates/CHANGELOG.md.j2 diff --git a/templates/.release_notes.md.j2 b/templates/.release_notes.md.j2 new file mode 100644 index 000000000..81af53590 --- /dev/null +++ b/templates/.release_notes.md.j2 @@ -0,0 +1,11 @@ +{% macro add_emoji(commit_type) %}{% if commit_type == "feature" %}✨{% elif commit_type == "fix" %}🐛{% elif commit_type == "documentation" %}📖{% elif commit_type == "style" %}🎨{% elif commit_type == "refactor" %}♻️{% elif commit_type == "test" %}✅{% elif commit_type == "chore" %}🚀{% elif commit_type == "performance" %}⚡️{% elif commit_type == "ci" %}👷‍♂️{% elif commit_type == "build" %}🏗{% elif commit_type == "breaking" %}💥{% elif commit_type == "unknown" %}🤷‍♂️{% elif commit_type == "revert" %}⏪{% else %}🤷‍♂️{% endif %}{% endmacro %} + +{% macro commit_scope(commit_summary) %}{{ commit_summary.split(":")[0] }}{% endmacro %} +{% macro commit_content(commit_summary) %}{{ commit_summary.split(":")[1] }}{% endmacro %} +## What's Changed +{% for type_, commits in release["elements"] | dictsort %} +### {{ add_emoji(type_) }} {{ type_ | capitalize }} +{%- if type_ != "unknown" %} +{% for commit in commits %} +- {{ commit_content(commit.commit.summary) }} - [`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }}) ([{{ commit.commit.author.name }}](mailto:{{commit.commit.author.email}})) +{%- endfor %}{% endif %}{% endfor %} diff --git a/templates/CHANGELOG.md.j2 b/templates/CHANGELOG.md.j2 new file mode 100644 index 000000000..c61ea7c09 --- /dev/null +++ b/templates/CHANGELOG.md.j2 @@ -0,0 +1,27 @@ +{% macro add_emoji(commit_type) %}{% if commit_type == "feature" %}✨{% elif commit_type == "fix" %}🐛{% elif commit_type == "documentation" %}📖{% elif commit_type == "style" %}🎨{% elif commit_type == "refactor" %}♻️{% elif commit_type == "test" %}✅{% elif commit_type == "chore" %}🚀{% elif commit_type == "performance" %}⚡️{% elif commit_type == "ci" %}👷‍♂️{% elif commit_type == "build" %}🏗{% elif commit_type == "breaking" %}💥{% elif commit_type == "unknown" %}🤷‍♂️{% elif commit_type == "revert" %}⏪{% else %}🤷‍♂️{% endif %}{% endmacro %} + +{% macro commit_scope(commit_summary) %}{{ commit_summary.split(":")[0] }}{% endmacro %} +{% macro 
commit_content(commit_summary) %}{{ commit_summary.split(":")[1] }}{% endmacro %} +# CHANGELOG +{% if context.history.unreleased | length > 0 %} + +{# UNRELEASED #} +## Unreleased +{% for type_, commits in context.history.unreleased | dictsort %} +### {{ add_emoji(type_) }} {{ type_ | capitalize }} +{% for commit in commits %}{% if type_ != "unknown" %} +- {{ commit_content(commit.commit.summary) }} - [`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }}) ([{{ commit.commit.author.name }}](mailto:{{commit.commit.author.email}})) +{% else %} +- {{ commit_content(commit.commit.summary) }} - [`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }}) ([{{ commit.commit.author.name }}](mailto:{{commit.commit.author.email}})) +{% endif %}{% endfor %}{% endfor %}{% endif %} + +{# RELEASED #} +{% for version, release in context.history.released.items() %} +## {{ version.as_tag() }} ({{ release.tagged_date.strftime("%Y-%m-%d") }}) +{% for type_, commits in release["elements"] | dictsort %} +### {{ add_emoji(type_) }} {{ type_ | capitalize }} +{% for commit in commits %}{% if type_ != "unknown" %} +- {{ commit_content(commit.commit.summary) }} - [`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }}) ([{{ commit.commit.author.name }}](mailto:{{commit.commit.author.email}})) +{% else %} +- {{ commit_content(commit.commit.summary) }} - [`{{ commit.commit.hexsha[:7] }}`]({{ commit.commit.hexsha | commit_hash_url }}) ([{{ commit.commit.author.name }}](mailto:{{commit.commit.author.email}})) +{% endif %}{% endfor %}{% endfor %}{% endfor %} From 8e8c403542ea091254dfa6b2d3da6fb0505cac55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Mon, 18 Dec 2023 17:23:24 +0000 Subject: [PATCH 04/21] fix(l2g): `calculate_feature_missingness_rate` counts features annotated with 0 as incomplete (#364) * chore: import wandb classes explicitly * fix: count feature annotation with 0 as incomplete * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/otg/dataset/l2g_feature_matrix.py | 7 ++++++- src/otg/method/l2g/evaluator.py | 7 +++---- src/otg/method/l2g/model.py | 7 ++++--- tests/dataset/test_l2g.py | 24 ++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/src/otg/dataset/l2g_feature_matrix.py b/src/otg/dataset/l2g_feature_matrix.py index c966b87c7..32bd8d06d 100644 --- a/src/otg/dataset/l2g_feature_matrix.py +++ b/src/otg/dataset/l2g_feature_matrix.py @@ -109,7 +109,12 @@ def calculate_feature_missingness_rate( raise ValueError("No features found") return { - feature: (self._df.filter(self._df[feature].isNull()).count() / total_count) + feature: ( + self._df.filter( + (self._df[feature].isNull()) | (self._df[feature] == 0) + ).count() + / total_count + ) for feature in self.features_list } diff --git a/src/otg/method/l2g/evaluator.py b/src/otg/method/l2g/evaluator.py index 527a48e5f..f41b1d45e 100644 --- a/src/otg/method/l2g/evaluator.py +++ b/src/otg/method/l2g/evaluator.py @@ -4,7 +4,6 @@ import itertools from typing import TYPE_CHECKING, Any, Dict -import wandb from pyspark import keyword_only from pyspark.ml.evaluation import ( BinaryClassificationEvaluator, @@ -12,10 +11,10 @@ MulticlassClassificationEvaluator, ) from pyspark.ml.param import Param, Params, TypeConverters +from wandb.sdk.wandb_run import Run if TYPE_CHECKING: from pyspark.sql import DataFrame - from wandb.wandb_run 
import Run class WandbEvaluator(Evaluator): @@ -124,11 +123,11 @@ def getspark_ml_evaluator(self: WandbEvaluator) -> Evaluator: """ return self.getOrDefault(self.spark_ml_evaluator) - def getwandb_run(self: WandbEvaluator) -> wandb.sdk.wandb_run.Run: + def getwandb_run(self: WandbEvaluator) -> Run: """Get the wandb_run parameter. Returns: - wandb.sdk.wandb_run.Run: Wandb run object. + Run: Wandb run object. """ return self.getOrDefault(self.wandb_run) diff --git a/src/otg/method/l2g/model.py b/src/otg/method/l2g/model.py index 61deb3066..2eedfc1f3 100644 --- a/src/otg/method/l2g/model.py +++ b/src/otg/method/l2g/model.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Type -import wandb from pyspark.ml import Pipeline, PipelineModel from pyspark.ml.evaluation import ( BinaryClassificationEvaluator, @@ -13,6 +12,8 @@ ) from pyspark.ml.feature import StringIndexer, VectorAssembler from pyspark.ml.tuning import ParamGridBuilder +from wandb.data_types import Table +from wandb.sdk import init as wandb_init from wandb.wandb_run import Run from xgboost.spark.core import SparkXGBClassifierModel @@ -126,7 +127,7 @@ def log_to_wandb( ## Track feature importance wandb_run.log({"importances": self.get_feature_importance()}) ## Track training set - training_table = wandb.Table(dataframe=training_data.df.toPandas()) + training_table = Table(dataframe=training_data.df.toPandas()) wandb_run.log({"trainingSet": training_table}) # Count number of positive and negative labels gs_counts_dict = { @@ -224,7 +225,7 @@ def evaluate( ) if wandb_run_name and training_data: - run = wandb.init( + run = wandb_init( project=self.wandb_l2g_project_name, config=hyperparameters, name=wandb_run_name, diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py index 3bf5d472c..c2aa21dcb 100644 --- a/tests/dataset/test_l2g.py +++ b/tests/dataset/test_l2g.py @@ -149,3 +149,27 @@ def test_remove_false_negatives(spark: SparkSession) -> None: ) assert observed_df.collect() == expected_df.collect() + + +def test_calculate_feature_missingness_rate(spark: SparkSession) -> None: + """Test L2GFeatureMatrix.calculate_feature_missingness_rate.""" + fm = L2GFeatureMatrix( + _df=spark.createDataFrame( + [ + (1, "gene1", 100.0, None), + (2, "gene2", 1000.0, 0.0), + ], + "studyLocusId LONG, geneId STRING, distanceTssMean DOUBLE, distanceTssMinimum DOUBLE", + ), + _schema=L2GFeatureMatrix.get_schema(), + ) + + expected_missingness = {"distanceTssMean": 0.0, "distanceTssMinimum": 1.0} + observed_missingness = fm.calculate_feature_missingness_rate() + assert isinstance(observed_missingness, dict) + assert len(observed_missingness) == len( + fm.features_list # type: ignore + ), "Missing features in the missingness rate dictionary." + assert ( + observed_missingness == expected_missingness + ), "Missingness rate is incorrect." 
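The fix in `calculate_feature_missingness_rate` above changes what counts as a missing feature: after the patch, a value of 0 is treated as incomplete, exactly like a null. A minimal, self-contained PySpark sketch of the new rule, reusing the toy data from the unit test added in this patch:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy feature matrix: a null and a literal 0.0 should both be counted as missing.
df = spark.createDataFrame(
    [(1, "gene1", 100.0, None), (2, "gene2", 1000.0, 0.0)],
    "studyLocusId LONG, geneId STRING, distanceTssMean DOUBLE, distanceTssMinimum DOUBLE",
)

total_count = df.count()
missingness = {
    feature: df.filter(f.col(feature).isNull() | (f.col(feature) == 0)).count() / total_count
    for feature in ["distanceTssMean", "distanceTssMinimum"]
}
print(missingness)  # {'distanceTssMean': 0.0, 'distanceTssMinimum': 1.0}

This mirrors the `isNull() | (== 0)` filter introduced in the patch: `distanceTssMinimum` holds only a null and a 0.0, so its missingness rate is 1.0, while the fully annotated `distanceTssMean` scores 0.0.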
From e3bcd722bbaffa78b815b12b2d4a8b1c08f106f4 Mon Sep 17 00:00:00 2001 From: Yakov Date: Wed, 20 Dec 2023 09:55:14 +0000 Subject: [PATCH 05/21] docs: corrected and added documentation to datasource (#362) * docs: corrected and added documentation to datasource * docs: corrected documentation to datasource - answering comments v1 * docs: corrections in datasource documentation --- docs/python_api/datasource/_datasource.md | 19 ++++++++++++++- .../eqtl_catalogue/_eqtl_catalogue.md | 11 +++++++++ .../python_api/datasource/finngen/_finngen.md | 4 +++- docs/python_api/datasource/gnomad/_gnomad.md | 6 +++++ .../datasource/gwas_catalog/_gwas_catalog.md | 14 +++++++++++ .../datasource/intervals/_intervals.md | 24 ++++++++++++++++--- .../datasource/open_targets/_open_targets.md | 10 ++++++-- .../datasource/ukbiobank/_ukbiobank.md | 24 ------------------- .../datasource/ukbiobank/study_index.md | 5 ---- 9 files changed, 81 insertions(+), 36 deletions(-) create mode 100644 docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md delete mode 100644 docs/python_api/datasource/ukbiobank/_ukbiobank.md delete mode 100644 docs/python_api/datasource/ukbiobank/study_index.md diff --git a/docs/python_api/datasource/_datasource.md b/docs/python_api/datasource/_datasource.md index e92a12f83..9fab444bf 100644 --- a/docs/python_api/datasource/_datasource.md +++ b/docs/python_api/datasource/_datasource.md @@ -4,4 +4,21 @@ title: Data Source # Data Source -TBC +This section contains information about the data sources used in Open Targets Genetics. + +We use GnomAD v4.0 as a source for variant annotation and GnomAD v2.1.1 as a source for linkage disequilibrium (LD) information (described in the **GnomAD** section). + +We rely on Open Targets as a source for the list of targets and the Gold Standard training set (described in the **Open Targets** section). + +## Study Sources + +1. GWAS Catalog +2. FinnGen + +## Molecular QTLs + +1. eQTL Catalogue + +## Interaction / Interval-based Experiments + +We integrate a list of studies that focus on interaction and interval-based investigations, shedding light on the intricate relationships between genetic elements and their functional implications. For more details, see the **"Intervals"** section. diff --git a/docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md b/docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md new file mode 100644 index 000000000..5382d9d57 --- /dev/null +++ b/docs/python_api/datasource/eqtl_catalogue/_eqtl_catalogue.md @@ -0,0 +1,11 @@ +--- +title: eQTL Catalogue +--- + +The [eQTL Catalogue](https://www.ebi.ac.uk/eqtl/) aims to provide uniformly processed gene expression and splicing Quantitative Trait Loci (QTLs) from all available public studies on humans. + +It serves as the primary resource of eQTLs that we use for colocalization and target prioritization. + +We utilize data from the following study within the eQTL Catalogue: + +1. **GTEx v8**, 49 tissues diff --git a/docs/python_api/datasource/finngen/_finngen.md b/docs/python_api/datasource/finngen/_finngen.md index bb996d94b..158b738af 100644 --- a/docs/python_api/datasource/finngen/_finngen.md +++ b/docs/python_api/datasource/finngen/_finngen.md @@ -12,4 +12,6 @@ title: FinnGen } -FinnGen is a research project in genomics and personalized medicine. It is large public-private partnership that has collected and analysed genome and health data from 500,000 Finnish biobank donors to understand the genetic basis of diseases. 
FinnGen is a now expanding into understanding the progression and biological mechanisms of diseases. FinnGen provides a world-class resource for further breakthroughs in disease prevention, diagnosis, and treatment and a outlook into our genetic make-up. +[FinnGen](https://www.finngen.fi/en) is a research project in genomics and personalized medicine, representing a large public-private partnership. The project has collected and analyzed genome and health data from 500,000 Finnish biobank donors to understand the genetic basis of diseases. FinnGen is now expanding its focus to comprehend the progression and biological mechanisms of diseases. This initiative provides a world-class resource for further breakthroughs in disease prevention, diagnosis, and treatment, offering insights into our genetic makeup. + +For a comprehensive understanding of the dataset and methods, refer to [Kurki et al., 2023](https://www.nature.com/articles/s41586-022-05473-8). diff --git a/docs/python_api/datasource/gnomad/_gnomad.md b/docs/python_api/datasource/gnomad/_gnomad.md index 5d282c438..aee2de8ad 100644 --- a/docs/python_api/datasource/gnomad/_gnomad.md +++ b/docs/python_api/datasource/gnomad/_gnomad.md @@ -11,3 +11,9 @@ title: GnomAD display: none; } + +[GnomAD](https://gnomad.broadinstitute.org/) (Genome Aggregation Database) is a comprehensive resource that provides aggregated genomic data from large-scale sequencing projects. It encompasses variants from diverse populations and is widely used for variant annotation and population genetics studies. + +We use **GnomAD v4.0** as a source for variant annotation, offering detailed information about the prevalence and distribution of genetic variants across different populations. This version of GnomAD provides valuable insights into the genomic landscape, aiding in the interpretation of genetic variants and their potential functional implications. + +Additionally, [**GnomAD v2.1.1**](https://gnomad.broadinstitute.org/news/2018-10-gnomad-v2-1/) is utilized as a source for linkage disequilibrium (LD) information. diff --git a/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md b/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md index c70cd2353..4f05a4047 100644 --- a/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md +++ b/docs/python_api/datasource/gwas_catalog/_gwas_catalog.md @@ -6,3 +6,17 @@ title: GWAS Catalog

GWAS Catalog

+ +The [GWAS Catalog](https://www.ebi.ac.uk/gwas/) is a comprehensive resource that aims to provide a curated collection of Genome-Wide Association Studies (GWAS) (including harmonized full GWAS summary statistics) across various traits and diseases in humans. + +It serves as a valuable repository of genetic associations identified in diverse populations, offering insights into the genetic basis of complex traits and diseases. + +We rely on the GWAS Catalog for a rich source of genetic associations, utilizing the data for analysis and interpretation. + +For detailed information on specific genetic associations, their significance, and associated studies, refer to the [GWAS Catalog](https://www.ebi.ac.uk/gwas/). + +Within our analyses, we leverage two different types of studies from the GWAS Catalog: + +1. **Studies with (full) GWAS summary stats** + +2. **Studies with top hits only - GWAS curated studies** diff --git a/docs/python_api/datasource/intervals/_intervals.md b/docs/python_api/datasource/intervals/_intervals.md index 73b73994e..f5e91028b 100644 --- a/docs/python_api/datasource/intervals/_intervals.md +++ b/docs/python_api/datasource/intervals/_intervals.md @@ -1,7 +1,25 @@ --- -title: Chromatin intevals +title: Interaction and Interval-based Studies --- -# Chromatin intervals +# List of Interaction and Interval-based Studies -TBC +In this section, we provide a list of studies that focus on interaction and interval-based investigations, shedding light on the intricate relationships between genetic elements and their functional implications. + +1. **Promoter Capture Hi-C (Javierre et al., 2016):** + _Title:_ "Lineage-Specific Genome Architecture Links Enhancers and Non-coding Disease Variants to Target Gene Promoters". + This study presents evidence linking genetic variation to genes through the application of Promoter Capture Hi-C across each of the 17 human primary hematopoietic cell types. The method captures interactions between promoters and distal regulatory elements, providing valuable insights into the three-dimensional chromatin architecture. DOI: 10.1016/j.cell.2016.09.037 + +2. **Enhancer-TSS Correlation (Andersson et al., 2014):** + _Title:_ "An Atlas of Active Enhancers across Human Cell Types and Tissues". + This study explores genetic variation's impact on genes by examining the correlation between the transcriptional activity of enhancers and transcription start sites. The findings are documented in the FANTOM5 CAGE expression atlas, offering a comprehensive view of the regulatory landscape. DOI: 10.1038/nature12787 + +3. **DHS-Promoter Correlation (Thurman et al., 2012):** + _Title:_ "The accessible chromatin landscape of the human genome". + Investigating genetic variation's connection to genes, this study employs the correlation of DNase I hypersensitive sites (DHS) and gene promoters. The analysis spans 125 cell and tissue types from the ENCODE project, providing a broad understanding of the regulatory interactions across diverse biological contexts. DOI: 10.1038/nature11232 + +4. **Promoter Capture Hi-C (Jung et al., 2019):** + _Title:_ "A compendium of promoter-centered long-range chromatin interactions in the human genome". + This study compiles a compendium of promoter-centered long-range chromatin interactions in the human genome. By focusing on the three-dimensional organization of chromatin, the research contributes to our understanding of the spatial arrangement of genetic elements and their implications in gene regulation. 
DOI: 10.1038/s41588-019-0494-8 + +For in-depth details on each study, refer to the respective publications. diff --git a/docs/python_api/datasource/open_targets/_open_targets.md b/docs/python_api/datasource/open_targets/_open_targets.md index 26a960e1e..4138f6597 100644 --- a/docs/python_api/datasource/open_targets/_open_targets.md +++ b/docs/python_api/datasource/open_targets/_open_targets.md @@ -12,6 +12,12 @@ title: Open Targets } -The Open Targets Platform is a comprehensive resource that aims to aggregate and harmonise various types of data to facilitate the identification, prioritisation, and validation of drug targets. By integrating publicly available datasets including data generated by the Open Targets consortium, the Platform builds and scores target-disease associations to assist in drug target identification and prioritisation. It also integrates relevant annotation information about targets, diseases, phenotypes, and drugs, as well as their most relevant relationships. +The Open Targets Platform is a comprehensive resource that aims to aggregate and harmonize various types of data to facilitate the identification, prioritization, and validation of drug targets. By integrating publicly available datasets, including data generated by the Open Targets consortium, the Platform builds and scores target-disease associations to assist in drug target identification and prioritization. It also integrates relevant annotation information about targets, diseases, phenotypes, and drugs, as well as their most relevant relationships. -Genomic data from Open Targets integrates human genome-wide association studies (GWAS) and functional genomics data including gene expression, protein abundance, chromatin interaction and conformation data from a wide range of cell types and tissues to make robust connections between GWAS-associated loci, variants and likely causal genes. +Within our analyses, we utilize Open Targets to derive two datasets: + +1. **The list of targets:** + This dataset provides a compilation of targets. In the Open Targets Platform, a target is understood as any naturally-occurring molecule that can be targeted by a medicinal product. The EMBL-EBI Ensembl database serves as the source for human targets in the Platform, with the Ensembl gene ID as the primary identifier. For more details, refer to [this link](https://platform-docs.opentargets.org/target). + +2. **The list of Gold Standard Positives:** + We use this dataset for training the Locus-to-Gene model. The current list contains 496 Gold Standard Positives. diff --git a/docs/python_api/datasource/ukbiobank/_ukbiobank.md b/docs/python_api/datasource/ukbiobank/_ukbiobank.md deleted file mode 100644 index 299a9c616..000000000 --- a/docs/python_api/datasource/ukbiobank/_ukbiobank.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: UK Biobank ---- -

- -

- - -The UK Biobank is a large-scale biomedical database and research resource that contains a diverse range of in-depth information from 500,000 volunteers in the United Kingdom. Its genomic data comprises whole-genome sequencing for a subset of participants, along with genotyping arrays for the entire cohort. The data has been a cornerstone for numerous genome-wide association studies (GWAS) and other genetic analyses, advancing our understanding of human health and disease. - -Recent efforts to rapidly and systematically apply established GWAS methods to all available data fields in UK Biobank have made available large repositories of summary statistics. To leverage these data disease locus discovery, we used full summary statistics from: -The Neale lab Round 2 (N=2139). - -- These analyses applied GWAS (implemented in Hail) to all data fields using imputed genotypes from HRC as released by UK Biobank in May 2017, consisting of 337,199 individuals post-QC. Full details of the Neale lab GWAS implementation are available here. We have remove all ICD-10 related traits from the Neale data to reduce overlap with the SAIGE results. -- http://www.nealelab.is/uk-biobank/ - The University of Michigan SAIGE analysis (N=1281). -- The SAIGE analysis uses PheCode derived phenotypes and applies a new method that "provides accurate P values even when case-control ratios are extremely unbalanced". See Zhou et al. (2018) for further details. -- https://pubmed.ncbi.nlm.nih.gov/30104761/ diff --git a/docs/python_api/datasource/ukbiobank/study_index.md b/docs/python_api/datasource/ukbiobank/study_index.md deleted file mode 100644 index 8b98cc31a..000000000 --- a/docs/python_api/datasource/ukbiobank/study_index.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Study Index ---- - -::: otg.datasource.ukbiobank.study_index.UKBiobankStudyIndex From 2ca28fd7a84cd677573b1f0d5f5809c354ef0c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Wed, 20 Dec 2023 10:04:50 +0000 Subject: [PATCH 06/21] fix: incorrect parsing of `app_name` in makefile (#367) * fix: correct app_name in makefile * chore: remove redundant dist cleaning * chore: streamline make rules dependencies --- Makefile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 10df3d820..627385541 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ PROJECT_ID ?= open-targets-genetics-dev REGION ?= europe-west1 -APP_NAME ?= $$(cat pyproject.toml| grep name | cut -d" " -f3 | sed 's/"//g') +APP_NAME ?= $$(cat pyproject.toml| grep -m 1 "name" | cut -d" " -f3 | sed 's/"//g') VERSION_NO ?= $$(poetry version --short) CLEAN_VERSION_NO := $(shell echo "$(VERSION_NO)" | tr -cd '[:alnum:]') BUCKET_NAME=gs://genetics_etl_python_playground/initialisation/${VERSION_NO}/ @@ -35,8 +35,7 @@ build-documentation: ## Create local server with documentation @echo "Building Documentation..." 
@poetry run mkdocs serve -create-dev-cluster: ## Spin up a simple dataproc cluster with all dependencies for development purposes - @${MAKE} build +create-dev-cluster: build ## Spin up a simple dataproc cluster with all dependencies for development purposes @echo "Creating Dataproc Dev Cluster" @gcloud config set project ${PROJECT_ID} @gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_VERSION_NO}" \ @@ -49,8 +48,7 @@ create-dev-cluster: ## Spin up a simple dataproc cluster with all dependencies f --optional-components=JUPYTER \ --enable-component-gateway -make update-dev-cluster: ## Reinstalls the package on the dev-cluster - @${MAKE} build +make update-dev-cluster: build ## Reinstalls the package on the dev-cluster @echo "Updating Dataproc Dev Cluster" @gcloud config set project ${PROJECT_ID} gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_VERSION_NO}" \ @@ -61,7 +59,6 @@ make update-dev-cluster: ## Reinstalls the package on the dev-cluster build: clean ## Build Python package with dependencies @gcloud config set project ${PROJECT_ID} @echo "Packaging Code and Dependencies for ${APP_NAME}-${VERSION_NO}" - @rm -rf ./dist @poetry build @tar -czf dist/config.tar.gz config/ @echo "Uploading to Dataproc" From fbf21eee11c1c718b72e6d3a3bdde399215918cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Wed, 20 Dec 2023 10:48:40 +0000 Subject: [PATCH 07/21] ci: set codecov default branch to dev (#368) --- codecov.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/codecov.yml b/codecov.yml index bc3b704f2..945dba691 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,3 +1,6 @@ +codecov: + branch: dev + comment: layout: "reach, diff, flags, files" behavior: default From 0986138390efea4582634d927f15d6fc01388025 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Thu, 21 Dec 2023 12:10:40 +0000 Subject: [PATCH 08/21] feat: Finngen R10 harmonisation and preprocessing (#370) * chore: remove unnecessary file * fix: several fixes on finngen harmonisation and preprocess * docs: update docs * fix: test * fix: uncomment line Co-authored-by: Daniel Suveges --------- Co-authored-by: Daniel Suveges --- config/datasets/gcp.yaml | 1 - config/step/finngen.yaml | 3 - config/step/finngen_studies.yaml | 2 + config/step/finngen_sumstat_preprocess.yaml | 3 + docs/python_api/step/finngen.md | 5 -- docs/python_api/step/finngen_studies.md | 5 ++ .../step/finngen_sumstat_preprocess.md | 5 ++ src/airflow/dags/finngen_harmonisation.py | 77 +++++++++++++++++++ src/airflow/dags/finngen_preprocess.py | 31 ++++---- src/otg/datasource/finngen/study_index.py | 8 +- src/otg/datasource/finngen/summary_stats.py | 14 ++-- src/otg/finngen.py | 50 ------------ src/otg/finngen_studies.py | 31 ++++++++ src/otg/finngen_sumstat_preprocess.py | 36 +++++++++ .../finngen/test_finngen_summary_stats.py | 2 +- 15 files changed, 190 insertions(+), 83 deletions(-) delete mode 100644 config/step/finngen.yaml create mode 100644 config/step/finngen_studies.yaml create mode 100644 config/step/finngen_sumstat_preprocess.yaml delete mode 100644 docs/python_api/step/finngen.md create mode 100644 docs/python_api/step/finngen_studies.md create mode 100644 docs/python_api/step/finngen_sumstat_preprocess.md create mode 100644 src/airflow/dags/finngen_harmonisation.py delete mode 100644 src/otg/finngen.py create mode 100644 src/otg/finngen_studies.py create mode 100644 src/otg/finngen_sumstat_preprocess.py diff --git a/config/datasets/gcp.yaml 
b/config/datasets/gcp.yaml index c198b4d96..e8d949ecd 100644 --- a/config/datasets/gcp.yaml +++ b/config/datasets/gcp.yaml @@ -24,7 +24,6 @@ catalog_sumstats_lut: ${datasets.inputs}/v2d/harmonised_list-r2023-11-24a.txt ukbiobank_manifest: gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_manifest.190430.tsv l2g_gold_standard_curation: ${datasets.inputs}/l2g/gold_standard/curation.json gene_interactions: ${datasets.inputs}/l2g/interaction # 23.09 data -finngen_phenotype_table_url: https://r9.finngen.fi/api/phenos eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabix_ftp_paths_imported.tsv # Output datasets diff --git a/config/step/finngen.yaml b/config/step/finngen.yaml deleted file mode 100644 index fb049db37..000000000 --- a/config/step/finngen.yaml +++ /dev/null @@ -1,3 +0,0 @@ -_target_: otg.finngen.FinnGenStep -finngen_study_index_out: ${datasets.finngen_study_index} -finngen_summary_stats_out: ${datasets.finngen_summary_stats} diff --git a/config/step/finngen_studies.yaml b/config/step/finngen_studies.yaml new file mode 100644 index 000000000..23b58c443 --- /dev/null +++ b/config/step/finngen_studies.yaml @@ -0,0 +1,2 @@ +_target_: otg.finngen_studies.FinnGenStudiesStep +finngen_study_index_out: ${datasets.finngen_study_index} diff --git a/config/step/finngen_sumstat_preprocess.yaml b/config/step/finngen_sumstat_preprocess.yaml new file mode 100644 index 000000000..319e7af63 --- /dev/null +++ b/config/step/finngen_sumstat_preprocess.yaml @@ -0,0 +1,3 @@ +_target_: otg.finngen_sumstat_preprocess.FinnGenSumstatPreprocessStep +raw_sumstats_path: ??? +out_sumstats_path: ??? diff --git a/docs/python_api/step/finngen.md b/docs/python_api/step/finngen.md deleted file mode 100644 index fedefae50..000000000 --- a/docs/python_api/step/finngen.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: FinnGen ---- - -::: otg.finngen.FinnGenStep diff --git a/docs/python_api/step/finngen_studies.md b/docs/python_api/step/finngen_studies.md new file mode 100644 index 000000000..cfd7342e9 --- /dev/null +++ b/docs/python_api/step/finngen_studies.md @@ -0,0 +1,5 @@ +--- +title: FinnGen Studies +--- + +::: otg.finngen_studies.FinnGenStudiesStep diff --git a/docs/python_api/step/finngen_sumstat_preprocess.md b/docs/python_api/step/finngen_sumstat_preprocess.md new file mode 100644 index 000000000..0b374a278 --- /dev/null +++ b/docs/python_api/step/finngen_sumstat_preprocess.md @@ -0,0 +1,5 @@ +--- +title: FinnGen Preprocess Summary Stats +--- + +::: otg.finngen_sumstat_preprocess.FinnGenSumstatPreprocessStep diff --git a/src/airflow/dags/finngen_harmonisation.py b/src/airflow/dags/finngen_harmonisation.py new file mode 100644 index 000000000..ad88e695c --- /dev/null +++ b/src/airflow/dags/finngen_harmonisation.py @@ -0,0 +1,77 @@ +"""Airflow DAG for the harmonisation part of the pipeline.""" +from __future__ import annotations + +import re +import time +from pathlib import Path +from typing import Any + +import common_airflow as common +from airflow.decorators import task +from airflow.models.dag import DAG +from airflow.providers.google.cloud.operators.gcs import GCSListObjectsOperator + +CLUSTER_NAME = "otg-finngen-harmonisation" +AUTOSCALING = "gwascatalog-harmonisation" # same as GWAS Catalog harmonisation +SUMMARY_STATS_BUCKET_NAME = "finngen-public-data-r10" +RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX" +SUMSTATS_PARQUET = f"{RELEASEBUCKET}/summary_statistics/finngen" + +with DAG( + dag_id=Path(__file__).stem, + description="Open 
Targets Genetics — Finngen harmonisation", + default_args=common.shared_dag_args, + **common.shared_dag_kwargs, +): + # List raw FinnGen summary statistics files from the public bucket + list_inputs = GCSListObjectsOperator( + task_id="list_raw_sumstats", + bucket=SUMMARY_STATS_BUCKET_NAME, + prefix="summary_stats", + match_glob="**/*.gz", + ) + + # Submit jobs to dataproc + @task(task_id="submit_jobs") + def submit_jobs(**kwargs: Any) -> None: + """Submit jobs to dataproc. + + Args: + **kwargs (Any): Keyword arguments. + """ + ti = kwargs["ti"] + todo = ti.xcom_pull(task_ids="list_raw_sumstats", key="return_value") + print("Number of jobs to submit: ", len(todo)) # noqa: T201 + for i in range(len(todo)): + # Not to exceed default quota 400 jobs per minute + if i > 0 and i % 399 == 0: + time.sleep(60) + input_path = todo[i] + match_result = re.search(r"summary_stats/finngen_(.*).gz", input_path) + if match_result: + study_id = match_result.group(1) + print("Submitting job for study: ", study_id) # noqa: T201 + common.submit_pyspark_job_no_operator( + cluster_name=CLUSTER_NAME, + step_id="finngen_sumstat_preprocess", + other_args=[ + f"step.raw_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{input_path}", + f"step.out_sumstats_path={SUMSTATS_PARQUET}/{study_id}.parquet", + ], + ) + + # list_inputs >> + ( + list_inputs + >> common.create_cluster( + CLUSTER_NAME, + autoscaling_policy=AUTOSCALING, + num_workers=8, + # num_preemptible_workers=8, + master_machine_type="n1-highmem-32", + worker_machine_type="n1-standard-2", + ) + >> common.install_dependencies(CLUSTER_NAME) + >> submit_jobs() + >> common.delete_cluster(CLUSTER_NAME) + ) diff --git a/src/airflow/dags/finngen_preprocess.py b/src/airflow/dags/finngen_preprocess.py index e1febd34f..3ca5f3907 100644 --- a/src/airflow/dags/finngen_preprocess.py +++ b/src/airflow/dags/finngen_preprocess.py @@ -11,12 +11,12 @@ AUTOSCALING = "finngen-preprocess" RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX" -SUMSTATS = "{RELEASEBUCKET}/summary_statistics/finngen" +SUMSTATS = f"{RELEASEBUCKET}/summary_statistics/finngen" WINDOWBASED_CLUMPED = ( - "{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_window_clumped/finngen" + f"{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_window_clumped/finngen" ) -LD_CLUMPED = "{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_ld_clumped/finngen" +LD_CLUMPED = f"{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_ld_clumped/finngen" -PICSED = "{RELEASEBUCKET}/credible_set/from_sumstats_study_locus/finngen" +PICSED = f"{RELEASEBUCKET}/credible_set/from_sumstats_study_locus/finngen" with DAG( dag_id=Path(__file__).stem, @@ -24,10 +24,10 @@ default_args=common.shared_dag_args, **common.shared_dag_kwargs, ): - study_and_sumstats = common.submit_step( + study_index = common.submit_step( cluster_name=CLUSTER_NAME, - step_id="finngen", - task_id="finngen_sumstats_and_study_index", + step_id="finngen_studies", + task_id="finngen_studies", ) window_based_clumping = common.submit_step( @@ -35,8 +35,8 @@ step_id="clump", task_id="finngen_window_based_clumping", other_args=[ - "step.input_path={SUMSTATS}", - "step.clumped_study_locus_path={WINDOWBASED_CLUMPED}", + f"step.input_path={SUMSTATS}", + f"step.clumped_study_locus_path={WINDOWBASED_CLUMPED}", ], ) ld_clumping = common.submit_step( @@ -44,8 +44,10 @@ step_id="clump", task_id="finngen_ld_clumping", other_args=[ - "step.input_path={WINDOWBASED_CLUMPED}", - "step.clumped_study_locus_path={LD_CLUMPED}", + 
f"step.input_path={WINDOWBASED_CLUMPED}", + f"step.ld_index_path={RELEASEBUCKET}/ld_index", + f"step.study_index_path={RELEASEBUCKET}/study_index/finngen", + f"step.clumped_study_locus_path={LD_CLUMPED}", ], trigger_rule=TriggerRule.ALL_DONE, ) @@ -64,10 +66,13 @@ ( common.create_cluster( - CLUSTER_NAME, autoscaling_policy=AUTOSCALING, master_disk_size=2000 + CLUSTER_NAME, + autoscaling_policy=AUTOSCALING, + master_disk_size=2000, + num_workers=6, ) >> common.install_dependencies(CLUSTER_NAME) - >> study_and_sumstats + >> study_index >> window_based_clumping >> ld_clumping >> pics diff --git a/src/otg/datasource/finngen/study_index.py b/src/otg/datasource/finngen/study_index.py index 0ebd1438a..5ab30ebe0 100644 --- a/src/otg/datasource/finngen/study_index.py +++ b/src/otg/datasource/finngen/study_index.py @@ -14,7 +14,7 @@ class FinnGenStudyIndex: The following information is aggregated/extracted: - - Study ID in the special format (FINNGEN_R9_*) + - Study ID in the special format (e.g. FINNGEN_R10_*) - Trait name (for example, Amoebiasis) - Number of cases and controls - Link to the summary statistics location @@ -22,10 +22,10 @@ class FinnGenStudyIndex: Some fields are also populated as constants, such as study type and the initial sample size. """ - finngen_phenotype_table_url: str = "https://r9.finngen.fi/api/phenos" - finngen_release_prefix: str = "FINNGEN_R9" + finngen_phenotype_table_url: str = "https://r10.finngen.fi/api/phenos" + finngen_release_prefix: str = "FINNGEN_R10" finngen_summary_stats_url_prefix: str = ( - "gs://finngen-public-data-r9/summary_stats/finngen_R9_" + "gs://finngen-public-data-r10/summary_stats/finngen_R10_" ) finngen_summary_stats_url_suffix: str = ".gz" diff --git a/src/otg/datasource/finngen/summary_stats.py b/src/otg/datasource/finngen/summary_stats.py index 281792a08..8fc966c5b 100644 --- a/src/otg/datasource/finngen/summary_stats.py +++ b/src/otg/datasource/finngen/summary_stats.py @@ -39,28 +39,27 @@ class FinnGenSummaryStats: def from_source( cls: type[FinnGenSummaryStats], spark: SparkSession, - raw_files: list[str], + raw_file: str, ) -> SummaryStatistics: """Ingests all summary statst for all FinnGen studies. Args: spark (SparkSession): Spark session object. - raw_files (list[str]): Paths to raw summary statistics .gz files. + raw_file (str): Path to raw summary statistics .gz files. Returns: SummaryStatistics: Processed summary statistics dataset """ + study_id = raw_file.split("/")[-1].split(".")[0].upper() processed_summary_stats_df = ( spark.read.schema(cls.raw_schema) .option("delimiter", "\t") - .csv(raw_files, header=True) + .csv(raw_file, header=True) # Drop rows which don't have proper position. .filter(f.col("pos").cast(t.IntegerType()).isNotNull()) .select( # From the full path, extracts just the filename, and converts to upper case to get the study ID. - f.upper(f.regexp_extract(f.input_file_name(), r"([^/]+)\.gz", 1)).alias( - "studyId" - ), + f.lit(study_id).alias("studyId"), # Add variant information. 
f.concat_ws( "_", @@ -82,6 +81,9 @@ def from_source( .filter( f.col("pos").cast(t.IntegerType()).isNotNull() & (f.col("beta") != 0) ) + # Average ~20Mb partitions with 30 partitions per study + .repartitionByRange(30, "chromosome", "position") + .sortWithinPartitions("chromosome", "position") ) # Initializing summary statistics object: diff --git a/src/otg/finngen.py b/src/otg/finngen.py deleted file mode 100644 index 6b179e2c5..000000000 --- a/src/otg/finngen.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Step to run FinnGen study table ingestion.""" - -from __future__ import annotations - -from dataclasses import dataclass - -from omegaconf import MISSING - -from otg.common.session import Session -from otg.datasource.finngen.study_index import FinnGenStudyIndex -from otg.datasource.finngen.summary_stats import FinnGenSummaryStats - - -@dataclass -class FinnGenStep: - """FinnGen ingestion step. - - Attributes: - session (Session): Session object. - finngen_study_index_out (str): Output path for the FinnGen study index dataset. - finngen_summary_stats_out (str): Output path for the FinnGen summary statistics. - """ - - session: Session = MISSING - finngen_study_index_out: str = MISSING - finngen_summary_stats_out: str = MISSING - - def __post_init__(self: FinnGenStep) -> None: - """Run step.""" - # Fetch study index. - # Process study index. - study_index = FinnGenStudyIndex.from_source(self.session.spark) - # Write study index. - study_index.df.write.mode(self.session.write_mode).parquet( - self.finngen_study_index_out - ) - - # Fetch summary stats locations - input_filenames = [row.summarystatsLocation for row in study_index.df.collect()] - # Process summary stats. - summary_stats = FinnGenSummaryStats.from_source( - self.session.spark, raw_files=input_filenames - ) - - # Write summary stats. - ( - summary_stats.df.write.partitionBy("studyId") - .mode(self.session.write_mode) - .parquet(self.finngen_summary_stats_out) - ) diff --git a/src/otg/finngen_studies.py b/src/otg/finngen_studies.py new file mode 100644 index 000000000..9a1d800e8 --- /dev/null +++ b/src/otg/finngen_studies.py @@ -0,0 +1,31 @@ +"""Step to run FinnGen study table ingestion.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from omegaconf import MISSING + +from otg.common.session import Session +from otg.datasource.finngen.study_index import FinnGenStudyIndex + + +@dataclass +class FinnGenStudiesStep: + """FinnGen study index generation step. + + Attributes: + session (Session): Session object. + finngen_study_index_out (str): Output path for the FinnGen study index dataset. + """ + + session: Session = MISSING + finngen_study_index_out: str = MISSING + finngen_summary_stats_out: str = MISSING + + def __post_init__(self: FinnGenStudiesStep) -> None: + """Run step.""" + # Fetch study index. + FinnGenStudyIndex.from_source(self.session.spark).df.write.mode( + self.session.write_mode + ).parquet(self.finngen_study_index_out) diff --git a/src/otg/finngen_sumstat_preprocess.py b/src/otg/finngen_sumstat_preprocess.py new file mode 100644 index 000000000..959c000dc --- /dev/null +++ b/src/otg/finngen_sumstat_preprocess.py @@ -0,0 +1,36 @@ +"""Step to run FinnGen study table ingestion.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from omegaconf import MISSING + +from otg.common.session import Session +from otg.datasource.finngen.summary_stats import FinnGenSummaryStats + + +@dataclass +class FinnGenSumstatPreprocessStep: + """FinnGen sumstats preprocessing. 
+ + Attributes: + session (Session): Session object. + raw_sumstats_path (str): Input path for the raw FinnGen summary statistics file. + out_sumstats_path (str): Output path for the preprocessed FinnGen summary statistics. + """ + + session: Session = MISSING + raw_sumstats_path: str = MISSING + out_sumstats_path: str = MISSING + + def __post_init__(self: FinnGenSumstatPreprocessStep) -> None: + """Run step.""" + # Process summary stats. + ( + FinnGenSummaryStats.from_source( + self.session.spark, raw_file=self.raw_sumstats_path + ) + .df.write.mode(self.session.write_mode) + .parquet(self.out_sumstats_path) + ) diff --git a/tests/datasource/finngen/test_finngen_summary_stats.py b/tests/datasource/finngen/test_finngen_summary_stats.py index 3a16d9a57..315d8cd64 100644 --- a/tests/datasource/finngen/test_finngen_summary_stats.py +++ b/tests/datasource/finngen/test_finngen_summary_stats.py @@ -12,7 +12,7 @@ def test_finngen_summary_stats_from_source(spark: SparkSession) -> None: assert isinstance( FinnGenSummaryStats.from_source( spark=spark, - raw_files=["tests/data_samples/finngen_R9_AB1_ACTINOMYCOSIS.gz"], + raw_file="tests/data_samples/finngen_R9_AB1_ACTINOMYCOSIS.gz", ), SummaryStatistics, ) From cc3e26fbd2f5b82aaac05ffef04608c1ba1e03b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Thu, 21 Dec 2023 13:19:40 +0100 Subject: [PATCH 09/21] feat(pics): remove variants from `locus` when PICS cannot be applied (#361) * feat(pics): variants not in locus when if pips cant be calculated * feat(pics): add empty_locus qc flag * chore(pics): add to finemappingMethod column * refactor(pics): change definition of non picsable based on ldset * Update src/otg/dataset/study_locus.py Co-authored-by: David Ochoa * Update src/otg/method/pics.py Co-authored-by: David Ochoa * Update tests/method/test_pics.py Co-authored-by: David Ochoa --------- Co-authored-by: David Ochoa Co-authored-by: Daniel Suveges --- src/otg/dataset/study_locus.py | 2 ++ src/otg/method/pics.py | 23 ++++++++++++++++++----- tests/method/test_pics.py | 17 ++++++++++++++++- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/otg/dataset/study_locus.py b/src/otg/dataset/study_locus.py index 64fcf4931..d84f40e55 100644 --- a/src/otg/dataset/study_locus.py +++ b/src/otg/dataset/study_locus.py @@ -37,6 +37,7 @@ class StudyLocusQualityCheck(Enum): AMBIGUOUS_STUDY (str): Association with ambiguous study UNRESOLVED_LD (str): Variant not found in LD reference LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped) + NOT_QUALIFYING_LD_BLOCK (str): LD block does not contain variants at the required R^2 threshold """ SUBSIGNIFICANT_FLAG = "Subsignificant p-value" @@ -49,6 +50,7 @@ class StudyLocusQualityCheck(Enum): UNRESOLVED_LD = "Variant not found in LD reference" LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)" NO_POPULATION = "Study does not have population annotation to resolve LD" + NOT_QUALIFYING_LD_BLOCK = "LD block does not contain variants at the required R^2 threshold" class CredibleInterval(Enum): diff --git a/src/otg/method/pics.py b/src/otg/method/pics.py index daff8dbf2..6d38f4643 100644 --- a/src/otg/method/pics.py +++ b/src/otg/method/pics.py @@ -8,7 +8,7 @@ import pyspark.sql.types as t from scipy.stats import norm -from otg.dataset.study_locus import StudyLocus +from otg.dataset.study_locus import StudyLocus, StudyLocusQualityCheck if TYPE_CHECKING: from pyspark.sql import Row @@ -127,7 +127,7 @@ def _finemap( ... 
Row(variantId="var2", r2Overall=None), ... ] >>> PICS._finemap(ld_set_with_no_r2, lead_neglog_p=10.0, k=6.4) - [{'variantId': 'var1', 'r2Overall': None}, {'variantId': 'var2', 'r2Overall': None}] + [] """ if ld_set is None: return None @@ -145,8 +145,7 @@ def _finemap( or tag_dict["r2Overall"] < 0.5 or not lead_neglog_p ): - # If PICS cannot be calculated, we'll return the original credible set - new_credible_set.append(tag_dict) + # If PICS cannot be calculated, we drop the variant from the credible set continue pics_snp_mu = PICS._pics_mu(lead_neglog_p, tag_dict["r2Overall"]) @@ -222,6 +221,9 @@ def finemap( lambda locus, neglog_p: PICS._finemap(locus, neglog_p, k), picsed_ldset_schema, ) + non_picsable_expr = ( + f.size(f.filter(f.col("ldSet"), lambda x: x.r2Overall >= 0.5)) == 0 + ) return StudyLocus( _df=( associations.df @@ -239,7 +241,18 @@ def finemap( ), ), ) - # Rename tagVariantId to variantId + .withColumn( + "qualityControls", + StudyLocus.update_quality_flag( + f.col("qualityControls"), + non_picsable_expr, + StudyLocusQualityCheck.NOT_QUALIFYING_LD_BLOCK, + ), + ) + .withColumn( + "finemappingMethod", + f.coalesce(f.col("finemappingMethod"), f.lit("pics")), + ) .drop("neglog_pvalue") ), _schema=StudyLocus.get_schema(), diff --git a/tests/method/test_pics.py b/tests/method/test_pics.py index 7d2f1f78e..41c9c5c20 100644 --- a/tests/method/test_pics.py +++ b/tests/method/test_pics.py @@ -30,11 +30,26 @@ def test_finemap_empty_array( def test_finemap_null_ld_set( self: TestFinemap, mock_study_locus: StudyLocus ) -> None: - """Test how we apply `finemap` when `locus` is null by returning a null field.""" + """Test how we apply `finemap` when `ldSet` is null by returning a null field.""" mock_study_locus.df = mock_study_locus.df.filter(f.col("ldSet").isNull()) observed_df = PICS.finemap(mock_study_locus).df.limit(1) assert observed_df.collect()[0]["locus"] is None + def test_finemap_quality_control( + self: TestFinemap, mock_study_locus: StudyLocus ) -> None: + """Test that we add an `empty locus` flag when no variant in the locus meets PICS criteria.""" + mock_study_locus.df = mock_study_locus.df.withColumn( + # Association with an empty ldSet + "ldSet", + f.when(f.col("ldSet").isNull(), f.array()).otherwise(f.col("ldSet")), + ).filter(f.size("ldSet") == 0) + observed_df = PICS.finemap(mock_study_locus).df.limit(1) + qc_flag = "LD block does not contain variants at the required R^2 threshold" + assert ( + qc_flag in observed_df.collect()[0]["qualityControls"] ), "Empty locus QC flag is missing." 
+ def test__finemap_udf() -> None: """Test the _finemap UDF with a simple case.""" From d12cbae08f58792417a04b01c2318bfb302c19a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Thu, 21 Dec 2023 15:48:56 +0100 Subject: [PATCH 10/21] chore(study_index): change numeric columns from long to integers (#371) * chore(study_index): change numeric columns to integers * chore(study_index): accommodate parsers to schema changes --- src/otg/assets/schemas/study_index.json | 10 +++++----- src/otg/datasource/eqtl_catalogue/study_index.py | 10 +++++----- src/otg/datasource/finngen/study_index.py | 10 ++++++---- src/otg/datasource/gwas_catalog/study_index.py | 14 +++++++------- src/otg/datasource/ukbiobank/study_index.py | 4 ++-- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/otg/assets/schemas/study_index.json b/src/otg/assets/schemas/study_index.json index 71c600952..f7e577921 100644 --- a/src/otg/assets/schemas/study_index.json +++ b/src/otg/assets/schemas/study_index.json @@ -89,19 +89,19 @@ }, { "name": "nCases", - "type": "long", + "type": "integer", "nullable": true, "metadata": {} }, { "name": "nControls", - "type": "long", + "type": "integer", "nullable": true, "metadata": {} }, { "name": "nSamples", - "type": "long", + "type": "integer", "nullable": true, "metadata": {} }, @@ -150,7 +150,7 @@ "fields": [ { "name": "sampleSize", - "type": "long", + "type": "integer", "nullable": true, "metadata": {} }, @@ -176,7 +176,7 @@ "fields": [ { "name": "sampleSize", - "type": "long", + "type": "integer", "nullable": true, "metadata": {} }, diff --git a/src/otg/datasource/eqtl_catalogue/study_index.py b/src/otg/datasource/eqtl_catalogue/study_index.py index ff1b97d22..760cb3a74 100644 --- a/src/otg/datasource/eqtl_catalogue/study_index.py +++ b/src/otg/datasource/eqtl_catalogue/study_index.py @@ -51,23 +51,23 @@ def _all_attributes() -> List[Column]: ).alias("traitFromSourceMappedIds"), ] sample_attributes = [ - f.lit(838).cast("long").alias("nSamples"), + f.lit(838).cast("integer").alias("nSamples"), f.lit("838 (281 females and 557 males)").alias("initialSampleSize"), f.array( f.struct( - f.lit(715).cast("long").alias("sampleSize"), + f.lit(715).cast("integer").alias("sampleSize"), f.lit("European American").alias("ancestry"), ), f.struct( - f.lit(103).cast("long").alias("sampleSize"), + f.lit(103).cast("integer").alias("sampleSize"), f.lit("African American").alias("ancestry"), ), f.struct( - f.lit(12).cast("long").alias("sampleSize"), + f.lit(12).cast("integer").alias("sampleSize"), f.lit("Asian American").alias("ancestry"), ), f.struct( - f.lit(16).cast("long").alias("sampleSize"), + f.lit(16).cast("integer").alias("sampleSize"), f.lit("Hispanic or Latino").alias("ancestry"), ), ).alias("discoverySamples"), diff --git a/src/otg/datasource/finngen/study_index.py b/src/otg/datasource/finngen/study_index.py index 5ab30ebe0..b542ac655 100644 --- a/src/otg/datasource/finngen/study_index.py +++ b/src/otg/datasource/finngen/study_index.py @@ -51,9 +51,11 @@ def from_source( f.lit(f"{cls.finngen_release_prefix}_"), f.col("phenocode") ).alias("studyId"), f.col("phenostring").alias("traitFromSource"), - f.col("num_cases").alias("nCases"), - f.col("num_controls").alias("nControls"), - (f.col("num_cases") + f.col("num_controls")).alias("nSamples"), + f.col("num_cases").cast("integer").alias("nCases"), + f.col("num_controls").cast("integer").alias("nControls"), + (f.col("num_cases") + f.col("num_controls")) + .cast("integer") 
+ .alias("nSamples"), f.lit(cls.finngen_release_prefix).alias("projectId"), f.lit("gwas").alias("studyType"), f.lit(True).alias("hasSumstats"), @@ -62,7 +64,7 @@ def from_source( ), f.array( f.struct( - f.lit(377277).cast("long").alias("sampleSize"), + f.lit(377277).cast("integer").alias("sampleSize"), f.lit("Finnish").alias("ancestry"), ) ).alias("discoverySamples"), diff --git a/src/otg/datasource/gwas_catalog/study_index.py b/src/otg/datasource/gwas_catalog/study_index.py index 4a0b020e9..27988c29d 100644 --- a/src/otg/datasource/gwas_catalog/study_index.py +++ b/src/otg/datasource/gwas_catalog/study_index.py @@ -70,7 +70,7 @@ def _parse_discovery_samples(discovery_samples: Column) -> Column: +--------------------------------------------+ """ - # To initialize return objects for aggregate functions, schema has to be definied: + # To initialize return objects for aggregate functions, schema has to be defined: schema = t.ArrayType( t.StructType( [ @@ -96,7 +96,7 @@ def _parse_discovery_samples(discovery_samples: Column) -> Column: ), lambda ancestry: f.struct( ancestry.alias("ancestry"), - f.lit(0).cast(t.LongType()).alias("sampleSize"), + f.lit(0).alias("sampleSize"), ), ) @@ -155,7 +155,7 @@ def _normalize_ancestries(merged: Column, ancestry: Column) -> Column: f.struct( a.ancestry.alias("ancestry"), (a.sampleSize + ancestry.sampleSize) - .cast(t.LongType()) + .cast(t.IntegerType()) .alias("sampleSize"), ), ).otherwise(a), @@ -387,7 +387,7 @@ def annotate_ancestries( f.struct( f.col("broadAncestralCategory").alias("ancestry"), f.col("numberOfIndividuals") - .cast(t.LongType()) + .cast(t.IntegerType()) .alias("sampleSize"), ) ) @@ -543,9 +543,9 @@ def annotate_discovery_sample_sizes( # Aggregating sample sizes for all ancestries: .groupBy("studyId") # studyId has not been split yet .agg( - f.sum("nCases").alias("nCases"), - f.sum("nControls").alias("nControls"), - f.sum("sampleSize").alias("nSamples"), + f.sum("nCases").cast("integer").alias("nCases"), + f.sum("nControls").cast("integer").alias("nControls"), + f.sum("sampleSize").cast("integer").alias("nSamples"), ) ) self.df = self.df.join(sample_size_lut, on="studyId", how="left") diff --git a/src/otg/datasource/ukbiobank/study_index.py b/src/otg/datasource/ukbiobank/study_index.py index 081a852ed..7ef33f46c 100644 --- a/src/otg/datasource/ukbiobank/study_index.py +++ b/src/otg/datasource/ukbiobank/study_index.py @@ -76,10 +76,10 @@ def from_source( "publicationJournal" ), f.col("n_total").cast("string").alias("initialSampleSize"), - f.col("n_cases").cast("long").alias("nCases"), + f.col("n_cases").cast("integer").alias("nCases"), f.array( f.struct( - f.col("n_total").cast("long").alias("sampleSize"), + f.col("n_total").cast("integer").alias("sampleSize"), f.concat(f.lit("European="), f.col("n_total")).alias( "ancestry" ), From fc5d897b8a8cfa547170786c0f92008553947cc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Thu, 21 Dec 2023 16:01:49 +0100 Subject: [PATCH 11/21] feat(l2g): add features based on predicted variant consequences (#360) * chore: import wandb classes explicitly * feat(l2g): add studylocusfeaturefactory._get_vep_features * chore: accommodate project to newer features * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * feat(l2g): include averaged features --- .../assets/schemas/l2g_feature_matrix.json | 12 ++ src/otg/dataset/l2g_feature_matrix.py | 1 + src/otg/l2g.py | 6 +- 
src/otg/method/l2g/feature_factory.py | 126 ++++++++++++++++++
tests/method/test_locus_to_gene.py | 9 ++
5 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/src/otg/assets/schemas/l2g_feature_matrix.json b/src/otg/assets/schemas/l2g_feature_matrix.json
index 4388d0861..c88bc21af 100644
--- a/src/otg/assets/schemas/l2g_feature_matrix.json
+++ b/src/otg/assets/schemas/l2g_feature_matrix.json
@@ -30,6 +30,18 @@
       "nullable": true,
       "type": "float"
     },
+    {
+      "metadata": {},
+      "name": "vepMaximumNeighborhood",
+      "nullable": true,
+      "type": "float"
+    },
+    {
+      "metadata": {},
+      "name": "vepMaximum",
+      "nullable": true,
+      "type": "float"
+    },
     {
       "metadata": {},
       "name": "eqtlColocClppLocalMaximum",
diff --git a/src/otg/dataset/l2g_feature_matrix.py b/src/otg/dataset/l2g_feature_matrix.py
index 32bd8d06d..7952a1b37 100644
--- a/src/otg/dataset/l2g_feature_matrix.py
+++ b/src/otg/dataset/l2g_feature_matrix.py
@@ -65,6 +65,7 @@ def generate_features(
             #     study_locus, study_index, colocalisation
             # ).df,
             StudyLocusFactory._get_tss_distance_features(study_locus, variant_gene).df,
+            StudyLocusFactory._get_vep_features(study_locus, variant_gene).df,
         ]:
             fm = reduce(
                 lambda x, y: x.unionByName(y),
diff --git a/src/otg/l2g.py b/src/otg/l2g.py
index c93906d22..701bad308 100644
--- a/src/otg/l2g.py
+++ b/src/otg/l2g.py
@@ -66,7 +66,11 @@ class LocusToGeneStep:
             # average distance of all tagging variants to gene TSS
             "distanceTssMean",
             # # minimum distance of all tagging variants to gene TSS
-            # "distanceTssMinimum",
+            "distanceTssMinimum",
+            # # maximum vep consequence score of the locus 95% credible set among all genes in the vicinity
+            "vepMaximumNeighborhood",
+            # # maximum vep consequence score of the locus 95% credible set split by gene
+            "vepMaximum",
             # # max clpp for each (study, locus, gene) aggregating over all eQTLs
             # "eqtlColocClppLocalMaximum",
             # # max clpp for each (study, locus) aggregating over all eQTLs
diff --git a/src/otg/method/l2g/feature_factory.py b/src/otg/method/l2g/feature_factory.py
index 735236959..19ba5bb88 100644
--- a/src/otg/method/l2g/feature_factory.py
+++ b/src/otg/method/l2g/feature_factory.py
@@ -14,6 +14,8 @@ from otg.dataset.study_locus import CredibleInterval, StudyLocus
 if TYPE_CHECKING:
+    from pyspark.sql import Column, DataFrame
+
     from otg.dataset.colocalisation import Colocalisation
     from otg.dataset.study_index import StudyIndex
     from otg.dataset.v2g import V2G
@@ -232,3 +234,127 @@ def _get_tss_distance_features(
             ),
             _schema=L2GFeature.get_schema(),
         )
+
+    @staticmethod
+    def _get_vep_features(
+        credible_set: StudyLocus,
+        v2g: V2G,
+    ) -> L2GFeature:
+        """Get the maximum and mean VEP scores for all variants in a locus's 95% credible set.
+
+        These features capture the functional impact of the variants in the locus. For more information on variant consequences, see: https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
+        Four features are derived: the maximum VEP score and the posterior-weighted mean VEP score, each computed per study locus/gene pair and across all genes in the neighborhood of the locus.
+
+
+        Args:
+            credible_set (StudyLocus): Study locus dataset with the associations to be annotated
+            v2g (V2G): V2G dataset with the variant/gene relationships and their consequences
+
+        Returns:
+            L2GFeature: Stores the VEP-based features.
+        """
+
+        def _aggregate_vep_feature(
+            df: DataFrame,
+            aggregation_expr: Column,
+            aggregation_cols: list[str],
+            feature_name: str,
+        ) -> DataFrame:
+            """Extracts the maximum or average VEP score after grouping by the given columns. Different aggregations return different predictive annotations.
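+            For example, aggregating f.max("score") over ["studyLocusId", "geneId"] yields the strongest predicted consequence per gene, while aggregating f.mean("weightedScore") over ["studyLocusId"] alone yields a posterior-weighted average across the whole locus neighborhood.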
+
+            If the aggregation_cols include "geneId", the maximum/mean VEP score per gene is returned.
+            Otherwise, the maximum/mean VEP score for all genes in the neighborhood of the locus is returned.
+
+            Args:
+                df (DataFrame): DataFrame with the VEP scores for each variant in a studyLocus
+                aggregation_expr (Column): Aggregation expression to apply
+                aggregation_cols (list[str]): Columns to group by
+                feature_name (str): Name of the feature to be returned
+
+            Returns:
+                DataFrame: DataFrame with the aggregated VEP score per locus or per locus/gene
+            """
+            if "geneId" in aggregation_cols:
+                return df.groupBy(aggregation_cols).agg(
+                    aggregation_expr.alias(feature_name)
+                )
+            return (
+                df.groupBy(aggregation_cols)
+                .agg(
+                    aggregation_expr.alias(feature_name),
+                    f.collect_set("geneId").alias("geneId"),
+                )
+                .withColumn("geneId", f.explode("geneId"))
+            )
+
+        credible_set_w_variant_consequences = (
+            credible_set.filter_credible_set(CredibleInterval.IS95)
+            .df.withColumn("variantInLocusId", f.explode(f.col("locus.variantId")))
+            .withColumn(
+                "variantInLocusPosteriorProbability",
+                f.explode(f.col("locus.posteriorProbability")),
+            )
+            .join(
+                # Join with V2G to get variant consequences
+                v2g.df.filter(
+                    f.col("datasourceId") == "variantConsequence"
+                ).withColumnRenamed("variantId", "variantInLocusId"),
+                on="variantInLocusId",
+            )
+            .withColumn(
+                "weightedScore",
+                f.col("score") * f.col("variantInLocusPosteriorProbability"),
+            )
+            .select(
+                "studyLocusId",
+                "variantId",
+                "studyId",
+                "geneId",
+                "score",
+                "weightedScore",
+            )
+            .distinct()
+            .persist()
+        )
+
+        return L2GFeature(
+            _df=convert_from_wide_to_long(
+                reduce(
+                    lambda x, y: x.unionByName(y, allowMissingColumns=True),
+                    [
+                        # Calculate overall max VEP score for all genes in the vicinity
+                        credible_set_w_variant_consequences.transform(
+                            _aggregate_vep_feature,
+                            f.max("score"),
+                            ["studyLocusId"],
+                            "vepMaximumNeighborhood",
+                        ),
+                        # Calculate overall max VEP score per gene
+                        credible_set_w_variant_consequences.transform(
+                            _aggregate_vep_feature,
+                            f.max("score"),
+                            ["studyLocusId", "geneId"],
+                            "vepMaximum",
+                        ),
+                        # Calculate posterior-weighted mean VEP score for all genes in the vicinity
+                        credible_set_w_variant_consequences.transform(
+                            _aggregate_vep_feature,
+                            f.mean("weightedScore"),
+                            ["studyLocusId"],
+                            "vepMeanNeighborhood",
+                        ),
+                        # Calculate posterior-weighted mean VEP score per gene
+                        credible_set_w_variant_consequences.transform(
+                            _aggregate_vep_feature,
+                            f.mean("weightedScore"),
+                            ["studyLocusId", "geneId"],
+                            "vepMean",
+                        ),
+                    ],
+                ),
+                id_vars=("studyLocusId", "geneId"),
+                var_name="featureName",
+                value_name="featureValue",
+            ).filter(f.col("featureValue").isNotNull()),
+            _schema=L2GFeature.get_schema(),
+        )
diff --git a/tests/method/test_locus_to_gene.py b/tests/method/test_locus_to_gene.py
index 25fe4daed..ff7a7b8a7 100644
--- a/tests/method/test_locus_to_gene.py
+++ b/tests/method/test_locus_to_gene.py
@@ -133,3 +133,12 @@ def test_get_tss_distance_features(
         assert isinstance(
             tss_distance, L2GFeature
         ), "Unexpected model type returned from _get_tss_distance_features"
+
+    def test_get_vep_features(
+        self: TestStudyLocusFactory, mock_study_locus: StudyLocus, mock_v2g: V2G
+    ) -> None:
+        """Test the function that extracts the VEP features."""
+        vep_features = StudyLocusFactory._get_vep_features(mock_study_locus, mock_v2g)
+        assert isinstance(
+            vep_features, L2GFeature
+        ), "Unexpected model type returned from _get_vep_features"

From 711f91152cf55e2892505fdf6c227699283a3e6f Mon Sep 17 00:00:00 2001
From: Kirill Tsukanov
Date: Tue, 2 Jan 2024
13:01:55 +0000 Subject: [PATCH 12/21] chore: set cluster delete TTL (#379) --- src/airflow/dags/common_airflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/airflow/dags/common_airflow.py b/src/airflow/dags/common_airflow.py index 476487e5e..a859a870e 100644 --- a/src/airflow/dags/common_airflow.py +++ b/src/airflow/dags/common_airflow.py @@ -102,7 +102,7 @@ def create_cluster( "CONFIGTAR": CONFIG_TAG, "PACKAGE": PACKAGE_WHEEL, }, - idle_delete_ttl=None, + idle_delete_ttl=30 * 60, # In seconds. autoscaling_policy=f"projects/{GCP_PROJECT}/regions/{GCP_REGION}/autoscalingPolicies/{autoscaling_policy}", ).make() From b3553d2bb24fce94eacc4e9f9a0938daa48537c1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 14:08:10 +0000 Subject: [PATCH 13/21] build(deps-dev): bump apache-airflow from 2.7.3 to 2.8.0 (#373) Bumps [apache-airflow](https://github.com/apache/airflow) from 2.7.3 to 2.8.0. - [Release notes](https://github.com/apache/airflow/releases) - [Changelog](https://github.com/apache/airflow/blob/main/RELEASE_NOTES.rst) - [Commits](https://github.com/apache/airflow/compare/2.7.3...2.8.0) --- updated-dependencies: - dependency-name: apache-airflow dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Kirill Tsukanov --- poetry.lock | 85 ++++++++++++++++++++++++++++++++------------------ pyproject.toml | 2 +- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/poetry.lock b/poetry.lock index 21b0e56bc..d22839d91 100644 --- a/poetry.lock +++ b/poetry.lock @@ -210,13 +210,13 @@ trio = ["trio (>=0.22)"] [[package]] name = "apache-airflow" -version = "2.7.3" +version = "2.8.0" description = "Programmatically author, schedule and monitor data pipelines" optional = false python-versions = "<3.12,~=3.8" files = [ - {file = "apache-airflow-2.7.3.tar.gz", hash = "sha256:7f519eed05a047fe347a48ffcab3f78278717c0494c58e42a00f71e54594ebb3"}, - {file = "apache_airflow-2.7.3-py3-none-any.whl", hash = "sha256:b5d3cd9f39e183dd77a6589bcd98a39e31459e792053885b5f584244b5ba7655"}, + {file = "apache-airflow-2.8.0.tar.gz", hash = "sha256:5917ee148125892764f4306ec76a62c66a5801218a99edfd3fe46e968cb1a344"}, + {file = "apache_airflow-2.8.0-py3-none-any.whl", hash = "sha256:4ee532b97ef6520ad8e1ffda325aabe265f007a14166570d5f710d4999c9ad1c"}, ] [package.dependencies] @@ -230,7 +230,6 @@ argcomplete = ">=1.10" asgiref = "*" attrs = ">=22.1.0" blinker = "*" -cattrs = ">=22.1.0" colorlog = ">=4.0.2,<5.0" configupdater = ">=3.1.1" connexion = {version = ">=2.10.0,<3.0", extras = ["flask"]} @@ -240,11 +239,12 @@ cryptography = ">=0.9.3" deprecated = ">=1.2.13" dill = ">=0.2.2" flask = ">=2.2,<2.3" -flask-appbuilder = "4.3.6" +flask-appbuilder = "4.3.10" flask-caching = ">=1.5.0" flask-login = ">=0.6.2" flask-session = ">=0.4.0" flask-wtf = ">=0.15" +fsspec = ">=2023.10.0" google-re2 = ">=1.0" graphviz = ">=0.12" gunicorn = ">=20.1.0" @@ -267,7 +267,7 @@ pathspec = ">=0.9.0" pendulum = ">=2.0,<3.0" pluggy = ">=1.0" psutil = ">=4.2.0" -pydantic = ">=1.10.0" +pydantic = ">=2.3.0" pygments = ">=2.0.1" pyjwt = ">=2.0.0" python-daemon = ">=3.0.0" @@ -285,15 +285,15 @@ tenacity = ">=6.2.0,<8.2.0 || >8.2.0" termcolor = ">=1.1.0" typing-extensions = ">=4.0.0" unicodecsv = ">=0.14.1" -werkzeug = ">=2.0" -WTForms = "<3.1.0" +universal-pathlib = ">=0.1.4" +werkzeug = 
">=2.0,<3" [package.extras] aiobotocore = ["aiobotocore (>=2.1.1)"] airbyte = ["apache-airflow-providers-airbyte"] alibaba = ["apache-airflow-providers-alibaba"] -all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.5.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opensearch", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", 
"apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=4.1.0)", "azure-mgmt-containerinstance (>=9.0.0)", "azure-mgmt-containerregistry (>=8.0.0)", "azure-mgmt-datafactory (>=2.0.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>8,<9)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opensearch-py (>=2.2.0)", 
"opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.4)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=1.32.0,!=1.33.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "zenpy (>=2.0.24)"] -all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.5.0)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.4)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] +all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "adlfs (>=2023.10.0)", "aiobotocore (>=2.1.1)", "aiofiles (>=23.2.0)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.6.0)", "apache-airflow (>=2.7.0)", "apache-airflow (>=2.8.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", 
"apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-cohere", "apache-airflow-providers-common-io", "apache-airflow-providers-common-sql", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openai", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opensearch", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-pgvector", "apache-airflow-providers-pinecone", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-weaviate", "apache-airflow-providers-yandex", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=4.1.0)", "azure-mgmt-containerinstance (>=9.0.0)", 
"azure-mgmt-containerregistry (>=8.0.0)", "azure-mgmt-cosmosdb", "azure-mgmt-datafactory (>=2.0.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-mgmt-storage (>=16.0.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-storage-file-share", "azure-synapse-artifacts (>=0.17.0)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cohere (>=4.27)", "confluent-kafka (>=1.8.2)", "cryptography (>=2.0.0)", "databricks-sql-connector (>=2.0.0,!=2.9.0,<3.0.0)", "datadog (>=0.14.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>=8.10,<9)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.10)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gcsfs (>=2023.10.0)", "gevent (>=0.13)", "google-ads (>=22.1.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.12.0)", "google-cloud-batch (>=0.13.0)", "google-cloud-bigquery-datatransfer (>=3.13.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.22.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.6)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.10.0)", "google-cloud-dataproc (>=5.8.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.18.0)", "google-cloud-orchestration-airflow (>=1.10.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.19.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-run (>=0.9.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "google-re2 (>=1.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "grpcio-status", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "inflection (>=0.5.1)", "influxdb-client (>=1.19.0)", "ipykernel", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mysql-connector-python (>=8.0.11)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openai[datalib] (>=1.0)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opensearch-py (>=2.2.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "paramiko (>=2.8.0)", "pdpyras (>=4.1.2)", "pgvector 
(>=0.2.3)", "pinecone-client (>=2.2.4)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.4)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pyarrow-hotfix", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.8)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "python3-saml (>=1.16.0)", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "s3fs (>=2023.10.0)", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=1.32.0,!=1.33.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.7.8)", "snowflake-sqlalchemy (>=1.1.0)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<4)", "weaviate-client (>=3.24.2)", "yandexcloud (>=0.228.0)", "zenpy (>=2.0.24)"] +all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.6.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.8.1)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,!=2.9.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysql-connector-python (>=8.0.11)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.4)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.8)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] amazon = ["apache-airflow-providers-amazon"] apache-atlas = ["atlasclient (>=0.1.2)"] apache-beam = ["apache-airflow-providers-apache-beam"] @@ -310,7 +310,6 @@ apache-livy = ["apache-airflow-providers-apache-livy"] apache-pig = ["apache-airflow-providers-apache-pig"] apache-pinot = ["apache-airflow-providers-apache-pinot"] apache-spark = ["apache-airflow-providers-apache-spark"] -apache-sqoop = ["apache-airflow-providers-apache-sqoop"] apache-webhdfs = 
["hdfs[avro,dataframe,kerberos] (>=2.0.4)"] apprise = ["apache-airflow-providers-apprise"] arangodb = ["apache-airflow-providers-arangodb"] @@ -321,24 +320,24 @@ atlassian-jira = ["apache-airflow-providers-atlassian-jira"] aws = ["apache-airflow-providers-amazon"] azure = ["apache-airflow-providers-microsoft-azure"] cassandra = ["apache-airflow-providers-apache-cassandra"] -celery = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-celery", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "flower (>=1.0.0)"] +celery = ["apache-airflow (>=2.6.0)", "apache-airflow-providers-celery", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "flower (>=1.0.0)", "google-re2 (>=1.0)"] cgroups = ["cgroupspy (>=0.2.2)"] cloudant = ["apache-airflow-providers-cloudant"] -cncf-kubernetes = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-cncf-kubernetes", "asgiref (>=3.5.2)", "cryptography (>=2.0.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)"] +cncf-kubernetes = ["aiofiles (>=23.2.0)", "apache-airflow (>=2.6.0)", "apache-airflow-providers-cncf-kubernetes", "asgiref (>=3.5.2)", "cryptography (>=2.0.0)", "google-re2 (>=1.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)"] +cohere = ["apache-airflow-providers-cohere"] +common-io = ["apache-airflow-providers-common-io"] common-sql = ["apache-airflow-providers-common-sql"] -dask = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-daskexecutor", "cloudpickle (>=1.4.1)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "distributed (>=2.11.1,!=2023.5.0)"] -daskexecutor = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-daskexecutor", "cloudpickle (>=1.4.1)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "distributed (>=2.11.1,!=2023.5.0)"] databricks = ["apache-airflow-providers-databricks"] datadog = ["apache-airflow-providers-datadog"] dbt-cloud = ["apache-airflow-providers-dbt-cloud"] deprecated-api = ["requests (>=2.26.0)"] -devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "ipdb", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.2.5)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pygithub", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-mock", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", 
"types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] -devel-all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.5.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opensearch", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", 
"arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=4.1.0)", "azure-mgmt-containerinstance (>=9.0.0)", "azure-mgmt-containerregistry (>=8.0.0)", "azure-mgmt-datafactory (>=2.0.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", 
"kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.2.5)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opensearch-py (>=2.2.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.4)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=1.32.0,!=1.33.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.5.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", 
"apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opensearch", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=4.1.0)", "azure-mgmt-containerinstance (>=9.0.0)", "azure-mgmt-containerregistry (>=8.0.0)", "azure-mgmt-datafactory (>=2.0.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", 
"azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.2.5)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opensearch-py (>=2.2.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.4)", "proto-plus (>=1.19.6)", 
"psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=1.32.0,!=1.33.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.2.5)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.4)", "pyarrow (>=9.0.0)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-kerberos (>=0.10.0)", "requests-mock", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "semver", 
"sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.6.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "deltalake (>=0.12.0)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "ipdb", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.2.9)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysql-connector-python (>=8.0.11)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pyarrow-hotfix", "pygithub", "pyiceberg (>=0.5.0)", "pytest (>=7.1)", "pytest-asyncio (!=0.23.0,!=0.23.1)", "pytest-cov", "pytest-httpx", "pytest-icdiff", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python3-saml (>=1.16.0)", "pywinrm", "requests-mock", "restructuredtext-lint", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "s3fs (>=2023.10.0)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-design (>=0.5.0)", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-aiofiles", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel-all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "adlfs (>=2023.10.0)", "aiobotocore (>=2.1.1)", "aiofiles (>=23.2.0)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.6.0)", "apache-airflow (>=2.7.0)", "apache-airflow (>=2.8.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", 
"apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-cohere", "apache-airflow-providers-common-io", "apache-airflow-providers-common-sql", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openai", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opensearch", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-pgvector", "apache-airflow-providers-pinecone", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-weaviate", "apache-airflow-providers-yandex", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=4.1.0)", "azure-mgmt-containerinstance (>=9.0.0)", "azure-mgmt-containerregistry (>=8.0.0)", "azure-mgmt-cosmosdb", "azure-mgmt-datafactory (>=2.0.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-mgmt-storage (>=16.0.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-file-datalake (>=12.9.1)", 
"azure-storage-file-share", "azure-synapse-artifacts (>=0.17.0)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cohere (>=4.27)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "databricks-sql-connector (>=2.0.0,!=2.9.0,<3.0.0)", "datadog (>=0.14.0)", "deltalake (>=0.12.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "elasticsearch (>=8.10,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.10)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gcsfs (>=2023.10.0)", "gevent (>=0.13)", "gitpython", "google-ads (>=22.1.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.12.0)", "google-cloud-batch (>=0.13.0)", "google-cloud-bigquery-datatransfer (>=3.13.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.22.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.6)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.10.0)", "google-cloud-dataproc (>=5.8.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.18.0)", "google-cloud-orchestration-airflow (>=1.10.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.19.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-run (>=0.9.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "google-re2 (>=1.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "grpcio-status", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "inflection (>=0.5.1)", "influxdb-client (>=1.19.0)", "ipdb", "ipykernel", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.2.9)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysql-connector-python (>=8.0.11)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openai[datalib] (>=1.0)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opensearch-py (>=2.2.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb 
(>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "paramiko (>=2.8.0)", "pdpyras (>=4.1.2)", "pgvector (>=0.2.3)", "pinecone-client (>=2.2.4)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.4)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pyarrow-hotfix", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pyiceberg (>=0.5.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.8)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest (>=7.1)", "pytest-asyncio (!=0.23.0,!=0.23.1)", "pytest-cov", "pytest-httpx", "pytest-icdiff", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "python3-saml (>=1.16.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "restructuredtext-lint", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "s3fs (>=2023.10.0)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=1.32.0,!=1.33.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.7.8)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-design (>=0.5.0)", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-aiofiles", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<4)", "weaviate-client (>=3.24.2)", "wheel", "yamllint", "yandexcloud (>=0.228.0)", "zenpy (>=2.0.24)"] +devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "adlfs (>=2023.10.0)", "aiobotocore (>=2.1.1)", "aiofiles (>=23.2.0)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.6.0)", "apache-airflow (>=2.7.0)", "apache-airflow (>=2.8.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", 
"apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-cohere", "apache-airflow-providers-common-io", "apache-airflow-providers-common-sql", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openai", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opensearch", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-pgvector", "apache-airflow-providers-pinecone", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-weaviate", "apache-airflow-providers-yandex", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=4.1.0)", "azure-mgmt-containerinstance (>=9.0.0)", "azure-mgmt-containerregistry (>=8.0.0)", "azure-mgmt-cosmosdb", "azure-mgmt-datafactory (>=2.0.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-mgmt-storage (>=16.0.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-storage-file-share", "azure-synapse-artifacts 
(>=0.17.0)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cohere (>=4.27)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "databricks-sql-connector (>=2.0.0,!=2.9.0,<3.0.0)", "datadog (>=0.14.0)", "deltalake (>=0.12.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "elasticsearch (>=8.10,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.10)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gcsfs (>=2023.10.0)", "gevent (>=0.13)", "gitpython", "google-ads (>=22.1.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.12.0)", "google-cloud-batch (>=0.13.0)", "google-cloud-bigquery-datatransfer (>=3.13.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.22.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.6)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.10.0)", "google-cloud-dataproc (>=5.8.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.18.0)", "google-cloud-orchestration-airflow (>=1.10.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.19.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-run (>=0.9.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "google-re2 (>=1.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "grpcio-status", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "inflection (>=0.5.1)", "influxdb-client (>=1.19.0)", "ipdb", "ipykernel", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.2.9)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysql-connector-python (>=8.0.11)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openai[datalib] (>=1.0)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opensearch-py (>=2.2.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", 
"pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "paramiko (>=2.8.0)", "pdpyras (>=4.1.2)", "pgvector (>=0.2.3)", "pinecone-client (>=2.2.4)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.4)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pyarrow-hotfix", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pyiceberg (>=0.5.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.8)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest (>=7.1)", "pytest-asyncio (!=0.23.0,!=0.23.1)", "pytest-cov", "pytest-httpx", "pytest-icdiff", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "python3-saml (>=1.16.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "restructuredtext-lint", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "s3fs (>=2023.10.0)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=1.32.0,!=1.33.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.7.8)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-design (>=0.5.0)", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-aiofiles", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<4)", "weaviate-client (>=3.24.2)", "wheel", "yamllint", "yandexcloud (>=0.228.0)", "zenpy (>=2.0.24)"] +devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.6.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "deltalake (>=0.12.0)", "docutils (<0.17.0)", "duckdb (>=0.9.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.2.9)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", 
"mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysql-connector-python (>=8.0.11)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.4)", "pyarrow (>=9.0.0)", "pyarrow-hotfix", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pyiceberg (>=0.5.0)", "pykerberos (>=1.1.13)", "pytest (>=7.1)", "pytest-asyncio (!=0.23.0,!=0.23.1)", "pytest-cov", "pytest-httpx", "pytest-icdiff", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python3-saml (>=1.16.0)", "pywinrm", "requests-kerberos (>=0.10.0)", "requests-mock", "restructuredtext-lint", "rich-click (>=1.7.0)", "ruff (>=0.0.219)", "s3fs (>=2023.10.0)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-design (>=0.5.0)", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-aiofiles", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] dingding = ["apache-airflow-providers-dingding"] discord = ["apache-airflow-providers-discord"] -doc = ["astroid (>=2.12.3,<3.0)", "checksumdir", "click (>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] +doc = ["astroid (>=2.12.3,<3.0)", "checksumdir", "click (>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-design (>=0.5.0)", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] doc-gen = ["eralchemy2"] docker = ["apache-airflow-providers-docker"] druid = ["apache-airflow-providers-apache-druid"] @@ -349,9 +348,9 @@ ftp = ["apache-airflow-providers-ftp"] gcp = ["apache-airflow-providers-google"] gcp-api = ["apache-airflow-providers-google"] github = ["apache-airflow-providers-github"] -github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] +github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.10)"] google = ["apache-airflow-providers-google"] -google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] +google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.10)"] grpc = ["apache-airflow-providers-grpc"] hashicorp = ["apache-airflow-providers-hashicorp"] hdfs = ["apache-airflow-providers-apache-hdfs"] @@ -362,7 +361,7 @@ influxdb = ["apache-airflow-providers-influxdb"] jdbc = ["apache-airflow-providers-jdbc"] jenkins = ["apache-airflow-providers-jenkins"] kerberos = ["pykerberos (>=1.1.13)", "requests-kerberos (>=0.10.0)", "thrift-sasl (>=0.2.0)"] -kubernetes = ["apache-airflow (>=2.4.0)", 
"apache-airflow-providers-cncf-kubernetes", "asgiref (>=3.5.2)", "cryptography (>=2.0.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)"] +kubernetes = ["aiofiles (>=23.2.0)", "apache-airflow (>=2.6.0)", "apache-airflow-providers-cncf-kubernetes", "asgiref (>=3.5.2)", "cryptography (>=2.0.0)", "google-re2 (>=1.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)"] ldap = ["ldap3 (>=2.5.1)", "python-ldap"] leveldb = ["plyvel"] microsoft-azure = ["apache-airflow-providers-microsoft-azure"] @@ -374,6 +373,7 @@ mssql = ["apache-airflow-providers-microsoft-mssql"] mysql = ["apache-airflow-providers-mysql"] neo4j = ["apache-airflow-providers-neo4j"] odbc = ["apache-airflow-providers-odbc"] +openai = ["apache-airflow-providers-openai"] openfaas = ["apache-airflow-providers-openfaas"] openlineage = ["apache-airflow-providers-openlineage"] opensearch = ["apache-airflow-providers-opensearch"] @@ -381,19 +381,21 @@ opsgenie = ["apache-airflow-providers-opsgenie"] oracle = ["apache-airflow-providers-oracle"] otel = ["opentelemetry-exporter-prometheus"] pagerduty = ["apache-airflow-providers-pagerduty"] -pandas = ["pandas (>=0.17.1)", "pyarrow (>=9.0.0)"] +pandas = ["pandas (>=0.17.1)", "pyarrow (>=9.0.0)", "pyarrow-hotfix"] papermill = ["apache-airflow-providers-papermill"] password = ["bcrypt (>=2.0.0)", "flask-bcrypt (>=0.7.1)"] +pgvector = ["apache-airflow-providers-pgvector"] +pinecone = ["apache-airflow-providers-pinecone"] pinot = ["apache-airflow-providers-apache-pinot"] -plexus = ["apache-airflow-providers-plexus"] postgres = ["apache-airflow-providers-postgres"] presto = ["apache-airflow-providers-presto"] -qds = ["apache-airflow-providers-qubole"] rabbitmq = ["amqp"] redis = ["apache-airflow-providers-redis"] s3 = ["apache-airflow-providers-amazon"] +s3fs = ["s3fs (>=2023.10.0)"] salesforce = ["apache-airflow-providers-salesforce"] samba = ["apache-airflow-providers-samba"] +saml = ["python3-saml (>=1.16.0)"] segment = ["apache-airflow-providers-segment"] sendgrid = ["apache-airflow-providers-sendgrid"] sentry = ["blinker (>=1.1)", "sentry-sdk (>=1.32.0,!=1.33.0)"] @@ -412,8 +414,10 @@ telegram = ["apache-airflow-providers-telegram"] trino = ["apache-airflow-providers-trino"] vertica = ["apache-airflow-providers-vertica"] virtualenv = ["virtualenv"] +weaviate = ["apache-airflow-providers-weaviate"] webhdfs = ["hdfs[avro,dataframe,kerberos] (>=2.0.4)"] winrm = ["apache-airflow-providers-microsoft-winrm"] +yandex = ["apache-airflow-providers-yandex"] zendesk = ["apache-airflow-providers-zendesk"] [[package]] @@ -1867,13 +1871,13 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-appbuilder" -version = "4.3.6" +version = "4.3.10" description = "Simple and rapid application development framework, built on top of Flask. includes detailed security, auto CRUD generation for your models, google charts and much more." 
optional = false python-versions = "~=3.7" files = [ - {file = "Flask-AppBuilder-4.3.6.tar.gz", hash = "sha256:8ca9710fa7d2704747d195e11b487d45a571f40559d8399d9d5dfa42ea1f3c78"}, - {file = "Flask_AppBuilder-4.3.6-py3-none-any.whl", hash = "sha256:840480dfd43134bebf78f3c7dc909e324c2689d2d9f27aeb1880a8a25466bc8d"}, + {file = "Flask-AppBuilder-4.3.10.tar.gz", hash = "sha256:4173c878e56b81c6acac5e3c80c133f4183f43442fd944552bd9f4023f5baceb"}, + {file = "Flask_AppBuilder-4.3.10-py3-none-any.whl", hash = "sha256:c0af506e1a68e7ee14f26a16fda829f1a14f8343654c30bdbb1351d23c545df9"}, ] [package.dependencies] @@ -1881,7 +1885,7 @@ apispec = {version = ">=6.0.0,<7", extras = ["yaml"]} click = ">=8,<9" colorama = ">=0.3.9,<1" email-validator = ">=1.0.5,<2" -Flask = ">=2,<3" +Flask = ">=2,<2.3.0" Flask-Babel = ">=1,<3" Flask-JWT-Extended = ">=4.0.0,<5.0.0" Flask-Limiter = ">3,<4" @@ -1896,6 +1900,7 @@ PyJWT = ">=2.0.0,<3.0.0" python-dateutil = ">=2.3,<3" SQLAlchemy = "<1.5" sqlalchemy-utils = ">=0.32.21,<1" +werkzeug = "<3" WTForms = "<4" [package.extras] @@ -7990,6 +7995,24 @@ files = [ {file = "unicodecsv-0.14.1.tar.gz", hash = "sha256:018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"}, ] +[[package]] +name = "universal-pathlib" +version = "0.1.4" +description = "pathlib api extended to use fsspec backends" +optional = false +python-versions = ">=3.8" +files = [ + {file = "universal_pathlib-0.1.4-py3-none-any.whl", hash = "sha256:f99186cf950bde1262de9a590bb019613ef84f9fabd9f276e8b019722201943a"}, + {file = "universal_pathlib-0.1.4.tar.gz", hash = "sha256:82e5d86d16a27e0ea1adc7d88acbcba9d02d5a45488163174f96d9ac289db2e4"}, +] + +[package.dependencies] +fsspec = ">=2022.1.0" + +[package.extras] +dev = ["adlfs", "aiohttp", "cheroot", "gcsfs", "hadoop-test-cluster", "moto[s3,server]", "mypy (==1.3.0)", "packaging", "pyarrow", "pydantic", "pydantic-settings", "pylint (==2.17.4)", "pytest (==7.3.2)", "pytest-cov (==4.1.0)", "pytest-mock (==3.11.1)", "pytest-sugar (==0.9.6)", "requests", "s3fs", "webdav4[fsspec]", "wsgidav"] +tests = ["mypy (==1.3.0)", "packaging", "pylint (==2.17.4)", "pytest (==7.3.2)", "pytest-cov (==4.1.0)", "pytest-mock (==3.11.1)", "pytest-sugar (==0.9.6)"] + [[package]] name = "uritemplate" version = "4.1.1" @@ -8462,4 +8485,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "61c92b5469c17469a2109c72da3789ef57dee5949201c36c0270aec674c8e079" +content-hash = "3d523163b0dda0e2c1a275e489768a7c70e2808552e94c7d671bfb482b1ddccd" diff --git a/pyproject.toml b/pyproject.toml index 507ed180d..0a3029dbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ pytest-xdist = "^3.5.0" ipython = "^8.18.1" ipykernel = "^6.27.1" google-cloud-dataproc = "^5.8.0" -apache-airflow = "^2.7.3" +apache-airflow = "^2.8.0" apache-airflow-providers-google = "^10.12.0" pydoclint = "^0.3.8" prettier = "^0.0.7" From f2720e9a6ce2d728f4e5d7f3a0ea3c45efb2f18a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 14:18:24 +0000 Subject: [PATCH 14/21] build(deps-dev): bump mypy from 1.7.1 to 1.8.0 (#374) Bumps [mypy](https://github.com/python/mypy) from 1.7.1 to 1.8.0. 
- [Changelog](https://github.com/python/mypy/blob/master/CHANGELOG.md) - [Commits](https://github.com/python/mypy/compare/v1.7.1...v1.8.0) --- updated-dependencies: - dependency-name: mypy dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 58 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/poetry.lock b/poetry.lock index d22839d91..4f54408c5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5077,38 +5077,38 @@ files = [ [[package]] name = "mypy" -version = "1.7.1" +version = "1.8.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:12cce78e329838d70a204293e7b29af9faa3ab14899aec397798a4b41be7f340"}, - {file = "mypy-1.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1484b8fa2c10adf4474f016e09d7a159602f3239075c7bf9f1627f5acf40ad49"}, - {file = "mypy-1.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31902408f4bf54108bbfb2e35369877c01c95adc6192958684473658c322c8a5"}, - {file = "mypy-1.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f2c2521a8e4d6d769e3234350ba7b65ff5d527137cdcde13ff4d99114b0c8e7d"}, - {file = "mypy-1.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:fcd2572dd4519e8a6642b733cd3a8cfc1ef94bafd0c1ceed9c94fe736cb65b6a"}, - {file = "mypy-1.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4b901927f16224d0d143b925ce9a4e6b3a758010673eeded9b748f250cf4e8f7"}, - {file = "mypy-1.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7f6985d05a4e3ce8255396df363046c28bea790e40617654e91ed580ca7c51"}, - {file = "mypy-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:944bdc21ebd620eafefc090cdf83158393ec2b1391578359776c00de00e8907a"}, - {file = "mypy-1.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9c7ac372232c928fff0645d85f273a726970c014749b924ce5710d7d89763a28"}, - {file = "mypy-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:f6efc9bd72258f89a3816e3a98c09d36f079c223aa345c659622f056b760ab42"}, - {file = "mypy-1.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6dbdec441c60699288adf051f51a5d512b0d818526d1dcfff5a41f8cd8b4aaf1"}, - {file = "mypy-1.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4fc3d14ee80cd22367caaaf6e014494415bf440980a3045bf5045b525680ac33"}, - {file = "mypy-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c6e4464ed5f01dc44dc9821caf67b60a4e5c3b04278286a85c067010653a0eb"}, - {file = "mypy-1.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:d9b338c19fa2412f76e17525c1b4f2c687a55b156320acb588df79f2e6fa9fea"}, - {file = "mypy-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:204e0d6de5fd2317394a4eff62065614c4892d5a4d1a7ee55b765d7a3d9e3f82"}, - {file = "mypy-1.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:84860e06ba363d9c0eeabd45ac0fde4b903ad7aa4f93cd8b648385a888e23200"}, - {file = "mypy-1.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8c5091ebd294f7628eb25ea554852a52058ac81472c921150e3a61cdd68f75a7"}, - {file = "mypy-1.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40716d1f821b89838589e5b3106ebbc23636ffdef5abc31f7cd0266db936067e"}, - {file = "mypy-1.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:5cf3f0c5ac72139797953bd50bc6c95ac13075e62dbfcc923571180bebb662e9"}, - {file = "mypy-1.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:78e25b2fd6cbb55ddfb8058417df193f0129cad5f4ee75d1502248e588d9e0d7"}, - {file = "mypy-1.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:75c4d2a6effd015786c87774e04331b6da863fc3fc4e8adfc3b40aa55ab516fe"}, - {file = "mypy-1.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2643d145af5292ee956aa0a83c2ce1038a3bdb26e033dadeb2f7066fb0c9abce"}, - {file = "mypy-1.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75aa828610b67462ffe3057d4d8a4112105ed211596b750b53cbfe182f44777a"}, - {file = "mypy-1.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ee5d62d28b854eb61889cde4e1dbc10fbaa5560cb39780c3995f6737f7e82120"}, - {file = "mypy-1.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:72cf32ce7dd3562373f78bd751f73c96cfb441de147cc2448a92c1a308bd0ca6"}, - {file = "mypy-1.7.1-py3-none-any.whl", hash = "sha256:f7c5d642db47376a0cc130f0de6d055056e010debdaf0707cd2b0fc7e7ef30ea"}, - {file = "mypy-1.7.1.tar.gz", hash = "sha256:fcb6d9afb1b6208b4c712af0dafdc650f518836065df0d4fb1d800f5d6773db2"}, + {file = "mypy-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:485a8942f671120f76afffff70f259e1cd0f0cfe08f81c05d8816d958d4577d3"}, + {file = "mypy-1.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:df9824ac11deaf007443e7ed2a4a26bebff98d2bc43c6da21b2b64185da011c4"}, + {file = "mypy-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2afecd6354bbfb6e0160f4e4ad9ba6e4e003b767dd80d85516e71f2e955ab50d"}, + {file = "mypy-1.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8963b83d53ee733a6e4196954502b33567ad07dfd74851f32be18eb932fb1cb9"}, + {file = "mypy-1.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:e46f44b54ebddbeedbd3d5b289a893219065ef805d95094d16a0af6630f5d410"}, + {file = "mypy-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:855fe27b80375e5c5878492f0729540db47b186509c98dae341254c8f45f42ae"}, + {file = "mypy-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c886c6cce2d070bd7df4ec4a05a13ee20c0aa60cb587e8d1265b6c03cf91da3"}, + {file = "mypy-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d19c413b3c07cbecf1f991e2221746b0d2a9410b59cb3f4fb9557f0365a1a817"}, + {file = "mypy-1.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9261ed810972061388918c83c3f5cd46079d875026ba97380f3e3978a72f503d"}, + {file = "mypy-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:51720c776d148bad2372ca21ca29256ed483aa9a4cdefefcef49006dff2a6835"}, + {file = "mypy-1.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:52825b01f5c4c1c4eb0db253ec09c7aa17e1a7304d247c48b6f3599ef40db8bd"}, + {file = "mypy-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5ac9a4eeb1ec0f1ccdc6f326bcdb464de5f80eb07fb38b5ddd7b0de6bc61e55"}, + {file = "mypy-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afe3fe972c645b4632c563d3f3eff1cdca2fa058f730df2b93a35e3b0c538218"}, + {file = "mypy-1.8.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:42c6680d256ab35637ef88891c6bd02514ccb7e1122133ac96055ff458f93fc3"}, + {file = "mypy-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:720a5ca70e136b675af3af63db533c1c8c9181314d207568bbe79051f122669e"}, + {file = "mypy-1.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:028cf9f2cae89e202d7b6593cd98db6759379f17a319b5faf4f9978d7084cdc6"}, + {file = "mypy-1.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:4e6d97288757e1ddba10dd9549ac27982e3e74a49d8d0179fc14d4365c7add66"}, + {file = "mypy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f1478736fcebb90f97e40aff11a5f253af890c845ee0c850fe80aa060a267c6"}, + {file = "mypy-1.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:42419861b43e6962a649068a61f4a4839205a3ef525b858377a960b9e2de6e0d"}, + {file = "mypy-1.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:2b5b6c721bd4aabaadead3a5e6fa85c11c6c795e0c81a7215776ef8afc66de02"}, + {file = "mypy-1.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5c1538c38584029352878a0466f03a8ee7547d7bd9f641f57a0f3017a7c905b8"}, + {file = "mypy-1.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ef4be7baf08a203170f29e89d79064463b7fc7a0908b9d0d5114e8009c3a259"}, + {file = "mypy-1.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7178def594014aa6c35a8ff411cf37d682f428b3b5617ca79029d8ae72f5402b"}, + {file = "mypy-1.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ab3c84fa13c04aeeeabb2a7f67a25ef5d77ac9d6486ff33ded762ef353aa5592"}, + {file = "mypy-1.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:99b00bc72855812a60d253420d8a2eae839b0afa4938f09f4d2aa9bb4654263a"}, + {file = "mypy-1.8.0-py3-none-any.whl", hash = "sha256:538fd81bb5e430cc1381a443971c0475582ff9f434c16cd46d2c66763ce85d9d"}, + {file = "mypy-1.8.0.tar.gz", hash = "sha256:6ff8b244d7085a0b425b56d327b480c3b29cafbd2eff27316a004f9a7391ae07"}, ] [package.dependencies] @@ -8485,4 +8485,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "3d523163b0dda0e2c1a275e489768a7c70e2808552e94c7d671bfb482b1ddccd" +content-hash = "771c0436f071cff05a0452e2152ee6dede822b73c18efca1d784e6c3ecb74bbe" diff --git a/pyproject.toml b/pyproject.toml index 0a3029dbe..5e3b0257a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ scikit-learn = "^1.3.2" [tool.poetry.dev-dependencies] pre-commit = "^3.6.0" -mypy = "^1.7" +mypy = "^1.8" pep8-naming = "^0.13.2" interrogate = "^1.5.0" isort = "^5.13.2" From 268ab32f89ebfc4a360e589d4a4e3125a40dc800 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 14:28:27 +0000 Subject: [PATCH 15/21] chore(deps): bump python-semantic-release/python-semantic-release (#372) Bumps [python-semantic-release/python-semantic-release](https://github.com/python-semantic-release/python-semantic-release) from 8.5.1 to 8.7.0. - [Release notes](https://github.com/python-semantic-release/python-semantic-release/releases) - [Changelog](https://github.com/python-semantic-release/python-semantic-release/blob/master/CHANGELOG.md) - [Commits](https://github.com/python-semantic-release/python-semantic-release/compare/v8.5.1...v8.7.0) --- updated-dependencies: - dependency-name: python-semantic-release/python-semantic-release dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Kirill Tsukanov --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 8f5cf3c3e..4fe48f20f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -58,7 +58,7 @@ jobs: - name: Python Semantic Release id: release - uses: python-semantic-release/python-semantic-release@v8.5.1 + uses: python-semantic-release/python-semantic-release@v8.7.0 with: github_token: ${{ secrets.GITHUB_TOKEN }} From 4db09d2d8a53d7ecec3ef45714dc8ef6f0068ea4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 14:33:11 +0000 Subject: [PATCH 16/21] build(deps-dev): bump python-semantic-release from 8.5.1 to 8.7.0 (#375) Bumps [python-semantic-release](https://github.com/python-semantic-release/python-semantic-release) from 8.5.1 to 8.7.0. - [Release notes](https://github.com/python-semantic-release/python-semantic-release/releases) - [Changelog](https://github.com/python-semantic-release/python-semantic-release/blob/master/CHANGELOG.md) - [Commits](https://github.com/python-semantic-release/python-semantic-release/compare/v8.5.1...v8.7.0) --- updated-dependencies: - dependency-name: python-semantic-release dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Kirill Tsukanov --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4f54408c5..234dc429a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6600,13 +6600,13 @@ python-slugify = ">=1.2.5" [[package]] name = "python-semantic-release" -version = "8.5.1" +version = "8.7.0" description = "Automatic Semantic Versioning for Python projects" optional = false python-versions = ">=3.7" files = [ - {file = "python-semantic-release-8.5.1.tar.gz", hash = "sha256:43789d3c8df6f5b959ae69350c3d44e70a51228dc56875b09111874037f2bfd8"}, - {file = "python_semantic_release-8.5.1-py3-none-any.whl", hash = "sha256:eb96145e01a552399c13396e645de1e28c69480d7b275e72cda9caa09982f83a"}, + {file = "python-semantic-release-8.7.0.tar.gz", hash = "sha256:6bbd11b1e8ac70e0946ed6d257094c851b2507edfbc393eef6093d0ed1dbe0b4"}, + {file = "python_semantic_release-8.7.0-py3-none-any.whl", hash = "sha256:a016b1cf43a5f3667ce2cfddd8e30b6210a2d52b0e2f6b487aae1164f2540eaa"}, ] [package.dependencies] @@ -6623,7 +6623,7 @@ shellingham = ">=1.5.0.post1" tomlkit = ">=0.10,<1.0" [package.extras] -dev = ["pre-commit", "ruff (==0.1.6)", "tox"] +dev = ["pre-commit", "ruff (==0.1.8)", "tox"] docs = ["Sphinx (<=6.0.0)", "furo (>=2023.3.27)", "sphinx-autobuild (==2021.03.14)", "sphinxcontrib-apidoc (==0.3.0)"] mypy = ["mypy", "types-requests"] test = ["coverage[toml] (>=6,<8)", "pytest (>=7,<8)", "pytest-clarity (>=1.0.1)", "pytest-cov (>=4,<5)", "pytest-lazy-fixture (>=0.6.3,<0.7.0)", "pytest-mock (>=3,<4)", "pytest-pretty (>=1.2.0,<2)", "pytest-xdist (>=2,<4)", "requests-mock (>=1.10.0,<2)", "responses (==0.23.3)", "types-pytest-lazy-fixture (>=0.6.3.3)"] @@ -8485,4 +8485,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = 
"771c0436f071cff05a0452e2152ee6dede822b73c18efca1d784e6c3ecb74bbe" +content-hash = "cf121a26dcefbf8b6639fbbda67cddad43232fc0ff5588ec58e25437c0b705af" diff --git a/pyproject.toml b/pyproject.toml index 5e3b0257a..0d498a034 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ apache-airflow-providers-google = "^10.12.0" pydoclint = "^0.3.8" prettier = "^0.0.7" deptry = "^0.12.0" -python-semantic-release = "^8.5.1" +python-semantic-release = "^8.7.0" yamllint = "^1.33.0" [tool.semantic_release] From 00972162470111c979142dcf20827013a1fd8949 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 14:50:19 +0000 Subject: [PATCH 17/21] build(deps-dev): bump mkdocs-git-revision-date-localized-plugin (#376) Bumps [mkdocs-git-revision-date-localized-plugin](https://github.com/timvink/mkdocs-git-revision-date-localized-plugin) from 1.2.1 to 1.2.2. - [Release notes](https://github.com/timvink/mkdocs-git-revision-date-localized-plugin/releases) - [Commits](https://github.com/timvink/mkdocs-git-revision-date-localized-plugin/compare/v1.2.1...v1.2.2) --- updated-dependencies: - dependency-name: mkdocs-git-revision-date-localized-plugin dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 234dc429a..0c3d58c6a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4826,13 +4826,13 @@ requests = "*" [[package]] name = "mkdocs-git-revision-date-localized-plugin" -version = "1.2.1" +version = "1.2.2" description = "Mkdocs plugin that enables displaying the localized date of the last git modification of a markdown file." 
optional = false python-versions = ">=3.6" files = [ - {file = "mkdocs-git-revision-date-localized-plugin-1.2.1.tar.gz", hash = "sha256:fc5b23a9d572cbba0114e9e17152001d01724990cb308830e58291fa614faf73"}, - {file = "mkdocs_git_revision_date_localized_plugin-1.2.1-py3-none-any.whl", hash = "sha256:d57dc99d67af917899e69c392f1ebccd1779fa243d641255469b03f8a3596b96"}, + {file = "mkdocs-git-revision-date-localized-plugin-1.2.2.tar.gz", hash = "sha256:0c43a9aac1fa69df99a823f833cc223bac9967b60d5261a857761c7c6e3b30de"}, + {file = "mkdocs_git_revision_date_localized_plugin-1.2.2-py3-none-any.whl", hash = "sha256:85c7fe9ab06e7a63c4e522c26fee8b51d357cb8cbe605064501ad80f4f31cb94"}, ] [package.dependencies] @@ -8485,4 +8485,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "cf121a26dcefbf8b6639fbbda67cddad43232fc0ff5588ec58e25437c0b705af" +content-hash = "94a4d406b3e2d5d4f574152356cf175741f4b1d190fd1a65d7428eaab1994be4" diff --git a/pyproject.toml b/pyproject.toml index 0d498a034..b68742ddb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ mkdocs = "^1.5.3" mkdocstrings-python = "^1.7.5" mkdocs-material = "*" mkdocs-section-index = "^0.3.4" -mkdocs-git-revision-date-localized-plugin = "^1.2.1" +mkdocs-git-revision-date-localized-plugin = "^1.2.2" mkdocs-autolinks-plugin = "^0.7.1" mkdocs-awesome-pages-plugin = "^2.9.2" mkdocs-exclude = "^1.0.2" From 8f962094dbb11282ee835a360b0b625dcb0d842e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 15:01:47 +0000 Subject: [PATCH 18/21] build(deps-dev): bump ipython from 8.18.1 to 8.19.0 (#377) Bumps [ipython](https://github.com/ipython/ipython) from 8.18.1 to 8.19.0. - [Release notes](https://github.com/ipython/ipython/releases) - [Commits](https://github.com/ipython/ipython/compare/8.18.1...8.19.0) --- updated-dependencies: - dependency-name: ipython dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 18 +++++++++--------- pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0c3d58c6a..4a90e0186 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3992,13 +3992,13 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio" [[package]] name = "ipython" -version = "8.18.1" +version = "8.19.0" description = "IPython: Productive Interactive Computing" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" files = [ - {file = "ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397"}, - {file = "ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27"}, + {file = "ipython-8.19.0-py3-none-any.whl", hash = "sha256:2f55d59370f59d0d2b2212109fe0e6035cfea436b1c0e6150ad2244746272ec5"}, + {file = "ipython-8.19.0.tar.gz", hash = "sha256:ac4da4ecf0042fb4e0ce57c60430c2db3c719fa8bdf92f8631d6bd8a5785d1f0"}, ] [package.dependencies] @@ -4014,17 +4014,17 @@ stack-data = "*" traitlets = ">=5" [package.extras] -all = ["black", "curio", "docrepr", "exceptiongroup", "ipykernel", "ipyparallel", "ipywidgets", "matplotlib", "matplotlib (!=3.2.0)", "nbconvert", "nbformat", "notebook", "numpy (>=1.22)", "pandas", "pickleshare", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio (<0.22)", "qtconsole", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "trio", "typing-extensions"] +all = ["black", "curio", "docrepr", "exceptiongroup", "ipykernel", "ipyparallel", "ipywidgets", "matplotlib", "matplotlib (!=3.2.0)", "nbconvert", "nbformat", "notebook", "numpy (>=1.23)", "pandas", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "qtconsole", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "trio", "typing-extensions"] black = ["black"] -doc = ["docrepr", "exceptiongroup", "ipykernel", "matplotlib", "pickleshare", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio (<0.22)", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "typing-extensions"] +doc = ["docrepr", "exceptiongroup", "ipykernel", "matplotlib", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "typing-extensions"] kernel = ["ipykernel"] nbconvert = ["nbconvert"] nbformat = ["nbformat"] notebook = ["ipywidgets", "notebook"] parallel = ["ipyparallel"] qtconsole = ["qtconsole"] -test = ["pickleshare", "pytest (<7.1)", "pytest-asyncio (<0.22)", "testpath"] -test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.22)", "pandas", "pickleshare", "pytest (<7.1)", "pytest-asyncio (<0.22)", "testpath", "trio"] +test = ["pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"] +test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath", "trio"] [[package]] name = "isodate" @@ -8485,4 +8485,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "94a4d406b3e2d5d4f574152356cf175741f4b1d190fd1a65d7428eaab1994be4" +content-hash = "e445af79a0eb25f73de6c46dbf4ab39af61a1ebed2cc2fb40a90168cee1c525c" diff --git a/pyproject.toml b/pyproject.toml 
index b68742ddb..d6152994d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,7 +65,7 @@ pytest-xdist = "^3.5.0"

 [tool.poetry.group.dev.dependencies]
-ipython = "^8.18.1"
+ipython = "^8.19.0"
 ipykernel = "^6.27.1"
 google-cloud-dataproc = "^5.8.0"
 apache-airflow = "^2.8.0"

From ee79424b2e8b71dd046a3c7267d645ef3b809e13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com>
Date: Thu, 4 Jan 2024 10:24:00 +0100
Subject: [PATCH 19/21] feat(study_locus): remove statistics after conditioning from schema (#383)

---
 src/otg/assets/schemas/study_locus.json | 24 ------------------------
 tests/conftest.py                       |  2 +-
 2 files changed, 1 insertion(+), 25 deletions(-)

diff --git a/src/otg/assets/schemas/study_locus.json b/src/otg/assets/schemas/study_locus.json
index 97f9da3bf..a48424722 100644
--- a/src/otg/assets/schemas/study_locus.json
+++ b/src/otg/assets/schemas/study_locus.json
@@ -164,18 +164,6 @@
       "nullable": true,
       "type": "integer"
     },
-    {
-      "metadata": {},
-      "name": "pValueMantissaConditioned",
-      "nullable": true,
-      "type": "float"
-    },
-    {
-      "metadata": {},
-      "name": "pValueExponentConditioned",
-      "nullable": true,
-      "type": "integer"
-    },
     {
       "metadata": {},
       "name": "beta",
@@ -188,18 +176,6 @@
       "nullable": true,
       "type": "double"
     },
-    {
-      "metadata": {},
-      "name": "betaConditioned",
-      "nullable": true,
-      "type": "double"
-    },
-    {
-      "metadata": {},
-      "name": "standardErrorConditioned",
-      "nullable": true,
-      "type": "double"
-    },
     {
       "metadata": {},
       "name": "r2Overall",
diff --git a/tests/conftest.py b/tests/conftest.py
index f23a7949c..09991b4fd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -192,7 +192,7 @@ def mock_study_locus_data(spark: SparkSession) -> DataFrame:
         .withColumnSpec("finemappingMethod", percentNulls=0.1)
         .withColumnSpec(
             "locus",
-            expr='array(named_struct("is95CredibleSet", cast(rand() > 0.5 as boolean), "is99CredibleSet", cast(rand() > 0.5 as boolean), "logABF", rand(), "posteriorProbability", rand(), "variantId", cast(rand() as string), "beta", rand(), "standardError", rand(), "betaConditioned", rand(), "standardErrorConditioned", rand(), "r2Overall", rand(), "pValueMantissaConditioned", rand(), "pValueExponentConditioned", rand(), "pValueMantissa", rand(), "pValueExponent", rand()))',
+            expr='array(named_struct("is95CredibleSet", cast(rand() > 0.5 as boolean), "is99CredibleSet", cast(rand() > 0.5 as boolean), "logABF", rand(), "posteriorProbability", rand(), "variantId", cast(rand() as string), "beta", rand(), "standardError", rand(), "r2Overall", rand(), "pValueMantissa", rand(), "pValueExponent", rand()))',
             percentNulls=0.1,
         )
     )

From 212198250e7d6a0a0ff24dcb1f356fbf4ac2e1b1 Mon Sep 17 00:00:00 2001
From: David Ochoa
Date: Thu, 4 Jan 2024 16:20:57 +0000
Subject: [PATCH 20/21] feat: carma outlier detection method (#281)

* feat: first draft of carma outlier detection
Co-authored-by: Yakov Tsepilov
* docs: add carma docs page
Co-authored-by: Yakov Tsepilov
* test: example of test
Co-authored-by: Yakov Tsepilov
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see
https://pre-commit.ci
* feat: adding tests to CARMA funcs v1
* feat: adding pandas as dependency
* feat: fixing tests for CARMA misc functions
* feat: small fixes in tests for CARMA misc functions
* feat: adding tests and data for tests for main carma functions
* feat: small tweak of test settings for CARMA
* feat: added documentation
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see
https://pre-commit.ci
* feat: adding more tests to set_gamma_func_base
* fix: updating poetry.lock
* feat: Update src/otg/method/carma.py
Co-authored-by: Kirill Tsukanov
* feat: Update src/otg/method/carma.py
Co-authored-by: Kirill Tsukanov
* feat: Update src/otg/method/carma.py
Co-authored-by: Kirill Tsukanov
* feat: Update src/otg/method/carma.py
Co-authored-by: Kirill Tsukanov
---------
Co-authored-by: Yakov Tsepilov
Co-authored-by: Yakov Tsepilov
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Daniel Suveges
Co-authored-by: Kirill Tsukanov
---
 docs/python_api/method/carma.md     |  20 +
 poetry.lock                         |  58 +-
 pyproject.toml                      |   1 +
 src/otg/method/carma.py             | 871 ++++++++++++++++++++++++++++
 tests/conftest.py                   |  14 +
 tests/data_samples/01_test_PIPs.txt |  22 +
 tests/data_samples/01_test_ld.csv   |  21 +
 tests/data_samples/01_test_z.csv    |  22 +
 tests/method/test_carma.py          |  56 ++
 9 files changed, 1056 insertions(+), 29 deletions(-)
 create mode 100644 docs/python_api/method/carma.md
 create mode 100644 src/otg/method/carma.py
 create mode 100644 tests/data_samples/01_test_PIPs.txt
 create mode 100644 tests/data_samples/01_test_ld.csv
 create mode 100644 tests/data_samples/01_test_z.csv
 create mode 100644 tests/method/test_carma.py
diff --git a/docs/python_api/method/carma.md b/docs/python_api/method/carma.md
new file mode 100644
index 000000000..1a5c568f4
--- /dev/null
+++ b/docs/python_api/method/carma.md
@@ -0,0 +1,20 @@
+---
+title: CARMA
+---
+
+CARMA is a method for fine-mapping and outlier detection, originally implemented in R ([CARMA on GitHub](https://github.com/ZikunY/CARMA)).
+
+The full repository for the reimplementation of CARMA in Python can be found [here](https://github.com/hlnicholls/carmapy/tree/0.1.0).
+
+This is a simplified version of CARMA with the following changes (a usage sketch follows the list):
+
+1. It uses only Spike-slab effect size priors and Poisson model priors.
+2. C++ is re-implemented in Python.
+3. The configuration list is stored differently: a string with the indexes of the causal SNPs replaces the sparse matrix.
+4. Fixed bugs in PIP calculation.
+5. No credible models.
+6. No credible sets, only PIPs.
+7. No functional annotations.
+8. Removed unnecessary parameters.
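The sketch below is not the CARMA algorithm itself; it only illustrates the expected inputs (a vector of per-variant z-scores plus an LD matrix, the same shapes as the `01_test_z.csv` and `01_test_ld.csv` test fixtures) and the expected output (one PIP per variant). As a stand-in for CARMA's spike-slab configuration search it uses a simple single-causal-variant approximate Bayes factor; the prior variance `w` is an illustrative assumption, not a CARMA parameter.

```python
# Hypothetical sketch only: this is NOT the CARMA implementation. It shows the
# shape of the inputs (per-variant z-scores plus an LD matrix) and the output
# (a PIP per variant), using a single-causal-variant approximation as a
# stand-in for CARMA's spike-slab model search.
import numpy as np

rng = np.random.default_rng(42)

n_snps = 20
z = rng.normal(size=n_snps)  # summary-statistic z-scores (cf. 01_test_z.csv)
z[5] = 6.0                   # make one variant strongly associated
ld = np.eye(n_snps)          # LD matrix (cf. 01_test_ld.csv); the stand-in
                             # below ignores it, but CARMA needs it to score
                             # multi-SNP causal configurations

# Wakefield-style approximate log Bayes factor per variant under a N(0, w)
# effect-size prior; w is an assumed prior variance, not a CARMA parameter.
w = 0.15
log_abf = 0.5 * (np.log(1.0 / (1.0 + w)) + (w / (1.0 + w)) * z**2)

# Assuming exactly one causal variant and a uniform prior over variants,
# the PIP of each variant is its normalised Bayes factor.
pips = np.exp(log_abf - log_abf.max())
pips /= pips.sum()
print(pips.round(3))  # the strongly associated variant dominates
```

The actual entry points live in `otg.method.carma.CARMA`, documented below.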
+ +:::otg.method.carma.CARMA diff --git a/poetry.lock b/poetry.lock index 4a90e0186..dbaa445af 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5479,36 +5479,36 @@ files = [ [[package]] name = "pandas" -version = "2.1.2" +version = "2.1.4" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" files = [ - {file = "pandas-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:24057459f19db9ebb02984c6fdd164a970b31a95f38e4a49cf7615b36a1b532c"}, - {file = "pandas-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a6cf8fcc8a63d333970b950a7331a30544cf59b1a97baf0a7409e09eafc1ac38"}, - {file = "pandas-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ae6ffbd9d614c20d028c7117ee911fc4e266b4dca2065d5c5909e401f8ff683"}, - {file = "pandas-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eff794eeb7883c5aefb1ed572e7ff533ae779f6c6277849eab9e77986e352688"}, - {file = "pandas-2.1.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:02954e285e8e2f4006b6f22be6f0df1f1c3c97adbb7ed211c6b483426f20d5c8"}, - {file = "pandas-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:5b40c9f494e1f27588c369b9e4a6ca19cd924b3a0e1ef9ef1a8e30a07a438f43"}, - {file = "pandas-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:08d287b68fd28906a94564f15118a7ca8c242e50ae7f8bd91130c362b2108a81"}, - {file = "pandas-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bbd98dcdcd32f408947afdb3f7434fade6edd408c3077bbce7bd840d654d92c6"}, - {file = "pandas-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e90c95abb3285d06f6e4feedafc134306a8eced93cb78e08cf50e224d5ce22e2"}, - {file = "pandas-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52867d69a54e71666cd184b04e839cff7dfc8ed0cd6b936995117fdae8790b69"}, - {file = "pandas-2.1.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8d0382645ede2fde352da2a885aac28ec37d38587864c0689b4b2361d17b1d4c"}, - {file = "pandas-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:65177d1c519b55e5b7f094c660ed357bb7d86e799686bb71653b8a4803d8ff0d"}, - {file = "pandas-2.1.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5aa6b86802e8cf7716bf4b4b5a3c99b12d34e9c6a9d06dad254447a620437931"}, - {file = "pandas-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d594e2ce51b8e0b4074e6644758865dc2bb13fd654450c1eae51201260a539f1"}, - {file = "pandas-2.1.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3223f997b6d2ebf9c010260cf3d889848a93f5d22bb4d14cd32638b3d8bba7ad"}, - {file = "pandas-2.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4944dc004ca6cc701dfa19afb8bdb26ad36b9bed5bcec617d2a11e9cae6902"}, - {file = "pandas-2.1.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3f76280ce8ec216dde336e55b2b82e883401cf466da0fe3be317c03fb8ee7c7d"}, - {file = "pandas-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:7ad20d24acf3a0042512b7e8d8fdc2e827126ed519d6bd1ed8e6c14ec8a2c813"}, - {file = "pandas-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:021f09c15e1381e202d95d4a21ece8e7f2bf1388b6d7e9cae09dfe27bd2043d1"}, - {file = "pandas-2.1.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e7f12b2de0060b0b858cfec0016e7d980ae5bae455a1746bfcc70929100ee633"}, - {file = "pandas-2.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c166b9bb27c1715bed94495d9598a7f02950b4749dba9349c1dd2cbf10729d"}, - 
{file = "pandas-2.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25c9976c17311388fcd953cb3d0697999b2205333f4e11e669d90ff8d830d429"}, - {file = "pandas-2.1.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:851b5afbb0d62f6129ae891b533aa508cc357d5892c240c91933d945fff15731"}, - {file = "pandas-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:e78507adcc730533619de07bfdd1c62b2918a68cd4419ea386e28abf7f6a1e5c"}, - {file = "pandas-2.1.2.tar.gz", hash = "sha256:52897edc2774d2779fbeb6880d2cfb305daa0b1a29c16b91f531a18918a6e0f3"}, + {file = "pandas-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bdec823dc6ec53f7a6339a0e34c68b144a7a1fd28d80c260534c39c62c5bf8c9"}, + {file = "pandas-2.1.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:294d96cfaf28d688f30c918a765ea2ae2e0e71d3536754f4b6de0ea4a496d034"}, + {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b728fb8deba8905b319f96447a27033969f3ea1fea09d07d296c9030ab2ed1d"}, + {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00028e6737c594feac3c2df15636d73ace46b8314d236100b57ed7e4b9ebe8d9"}, + {file = "pandas-2.1.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:426dc0f1b187523c4db06f96fb5c8d1a845e259c99bda74f7de97bd8a3bb3139"}, + {file = "pandas-2.1.4-cp310-cp310-win_amd64.whl", hash = "sha256:f237e6ca6421265643608813ce9793610ad09b40154a3344a088159590469e46"}, + {file = "pandas-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b7d852d16c270e4331f6f59b3e9aa23f935f5c4b0ed2d0bc77637a8890a5d092"}, + {file = "pandas-2.1.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7d5f2f54f78164b3d7a40f33bf79a74cdee72c31affec86bfcabe7e0789821"}, + {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0aa6e92e639da0d6e2017d9ccff563222f4eb31e4b2c3cf32a2a392fc3103c0d"}, + {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d797591b6846b9db79e65dc2d0d48e61f7db8d10b2a9480b4e3faaddc421a171"}, + {file = "pandas-2.1.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d2d3e7b00f703aea3945995ee63375c61b2e6aa5aa7871c5d622870e5e137623"}, + {file = "pandas-2.1.4-cp311-cp311-win_amd64.whl", hash = "sha256:dc9bf7ade01143cddc0074aa6995edd05323974e6e40d9dbde081021ded8510e"}, + {file = "pandas-2.1.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:482d5076e1791777e1571f2e2d789e940dedd927325cc3cb6d0800c6304082f6"}, + {file = "pandas-2.1.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a706cfe7955c4ca59af8c7a0517370eafbd98593155b48f10f9811da440248b"}, + {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0513a132a15977b4a5b89aabd304647919bc2169eac4c8536afb29c07c23540"}, + {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9f17f2b6fc076b2a0078862547595d66244db0f41bf79fc5f64a5c4d635bead"}, + {file = "pandas-2.1.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:45d63d2a9b1b37fa6c84a68ba2422dc9ed018bdaa668c7f47566a01188ceeec1"}, + {file = "pandas-2.1.4-cp312-cp312-win_amd64.whl", hash = "sha256:f69b0c9bb174a2342818d3e2778584e18c740d56857fc5cdb944ec8bbe4082cf"}, + {file = "pandas-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3f06bda01a143020bad20f7a85dd5f4a1600112145f126bc9e3e42077c24ef34"}, + {file = "pandas-2.1.4-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:ab5796839eb1fd62a39eec2916d3e979ec3130509930fea17fe6f81e18108f6a"}, + {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edbaf9e8d3a63a9276d707b4d25930a262341bca9874fcb22eff5e3da5394732"}, + {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ebfd771110b50055712b3b711b51bee5d50135429364d0498e1213a7adc2be8"}, + {file = "pandas-2.1.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8ea107e0be2aba1da619cc6ba3f999b2bfc9669a83554b1904ce3dd9507f0860"}, + {file = "pandas-2.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:d65148b14788b3758daf57bf42725caa536575da2b64df9964c563b015230984"}, + {file = "pandas-2.1.4.tar.gz", hash = "sha256:fcb68203c833cc735321512e13861358079a96c174a61f5116a1de89c58c0ef7"}, ] [package.dependencies] @@ -5518,7 +5518,7 @@ pytz = ">=2020.1" tzdata = ">=2022.1" [package.extras] -all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] +all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] aws = ["s3fs (>=2022.05.0)"] clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] compression = ["zstandard (>=0.17.0)"] @@ -5538,7 +5538,7 @@ plot = ["matplotlib (>=3.6.1)"] postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] spss = ["pyreadstat (>=1.1.5)"] sql-other = ["SQLAlchemy (>=1.4.36)"] -test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] xml = ["lxml (>=4.8.0)"] [[package]] @@ -8485,4 +8485,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "e445af79a0eb25f73de6c46dbf4ab39af61a1ebed2cc2fb40a90168cee1c525c" +content-hash = "8132758e0409632a8e8448a97e79670bcdae63c72ce60c94e16e904e8f30a4cd" diff --git a/pyproject.toml b/pyproject.toml index d6152994d..020324f24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ google = "^3.0.0" omegaconf = "^2.3.0" typing-extensions = "^4.9.0" 
scikit-learn = "^1.3.2" +pandas = "^2.1.4" [tool.poetry.dev-dependencies] pre-commit = "^3.6.0" diff --git a/src/otg/method/carma.py b/src/otg/method/carma.py new file mode 100644 index 000000000..85722d24d --- /dev/null +++ b/src/otg/method/carma.py @@ -0,0 +1,871 @@ +"""CARMA outlier detection method.""" +from __future__ import annotations + +from itertools import combinations +from math import floor, lgamma +from typing import Any + +import numpy as np +import pandas as pd +from scipy.linalg import det, inv, pinv +from scipy.optimize import minimize_scalar + + +class CARMA: + """Implementation of the CARMA outlier detection method.""" + + @staticmethod + def CARMA_spike_slab_noEM( + z: np.ndarray, + ld: np.ndarray, + lambda_val: float = 1, + Max_Model_Dim: int = 200_000, + all_iter: int = 1, + all_inner_iter: int = 10, + epsilon_threshold: float = 1e-5, + num_causal: int = 10, + tau: float = 0.04, + outlier_switch: bool = True, + outlier_BF_index: float = 1 / 3.2, + ) -> dict[str, Any]: + """Perform CARMA analysis using a Spike-and-Slab prior without Expectation-Maximization (EM). + + Args: + z (np.ndarray): Numeric vector representing z-scores. + ld (np.ndarray): Numeric matrix representing the linkage disequilibrium (LD) matrix. + lambda_val (float): Hyperparameter of the Poisson prior on the number of causal variants. + Max_Model_Dim (int): Maximum number of candidate models retained in the model space. + all_iter (int): The total number of iterations to run the CARMA analysis. + all_inner_iter (int): The number of inner iterations in each CARMA iteration. + epsilon_threshold (float): Threshold for convergence in CARMA iterations. + num_causal (int): Maximum number of causal variants to be selected in the final model. + tau (float): Precision hyperparameter of the Spike-and-Slab effect size prior. + outlier_switch (bool): Whether to consider outlier detection in the analysis. + outlier_BF_index (float): Bayes Factor threshold for identifying outliers. + + Returns: + dict[str, Any]: A dictionary containing the following results: + - PIPs: A numeric vector of posterior inclusion probabilities (PIPs) for all SNPs. + - B_list: A dataframe containing the marginal likelihoods and the corresponding model space. + - Outliers: A list of outlier SNPs.
+ """ + p_snp = len(z) + epsilon_list = epsilon_threshold * p_snp + all_epsilon_threshold = epsilon_threshold * p_snp + + # Zero step + all_C_list = CARMA._MCS_modified( + z=z, + ld_matrix=ld, + epsilon=epsilon_list, + Max_Model_Dim=Max_Model_Dim, + lambda_val=lambda_val, + outlier_switch=outlier_switch, + tau=tau, + num_causal=num_causal, + inner_all_iter=all_inner_iter, + outlier_BF_index=outlier_BF_index, + ) + + # Main steps + for _ in range(0, all_iter): + ac1 = all_C_list["B_list"]["set_gamma_margin"] + previous_result = np.mean(ac1[0 : round(len(ac1) / 4)]) + + all_C_list = CARMA._MCS_modified( + z=z, + ld_matrix=ld, + input_conditional_S_list=all_C_list["conditional_S_list"], + Max_Model_Dim=Max_Model_Dim, + num_causal=num_causal, + epsilon=epsilon_list, + outlier_switch=outlier_switch, + tau=tau, + lambda_val=lambda_val, + inner_all_iter=all_inner_iter, + outlier_BF_index=outlier_BF_index, + ) + + ac1 = all_C_list["B_list"]["set_gamma_margin"] + difference = np.abs(previous_result - np.mean(ac1[0 : round(len(ac1) / 4)])) + if difference < all_epsilon_threshold: + break + + # Calculate PIPs + pip = CARMA._PIP_func( + likeli=all_C_list["B_list"]["set_gamma_margin"], + model_space=all_C_list["B_list"]["matrix_gamma"], + p=p_snp, + num_causal=num_causal, + ) + + results_list = { + "PIPs": pip, + "B_list": all_C_list["B_list"], + "Outliers": all_C_list["conditional_S_list"], + } + + return results_list + + @staticmethod + def _ind_normal_sigma_fixed_marginal_fun_indi( + zSigmaz_S: np.ndarray, tau: float, p_S: int, det_S: float + ) -> float: + """Internal function for calculating the marginal likelihood of a configuration model. + + Args: + zSigmaz_S (np.ndarray): The zSigmaz_S value. + tau (float): The tau value. + p_S (int): The number of SNPs. + det_S (float): The det_S value. + + Returns: + float: The marginal likelihood of a model. + + Examples: + >>> zSigmaz_S = 0.1 + >>> tau = 1 / 0.05**2 + >>> p_S = 3 + >>> det_S = 0.1 + >>> np.round(CARMA._ind_normal_sigma_fixed_marginal_fun_indi(zSigmaz_S, tau, p_S, det_S),decimals=5) + 10.18849 + """ + return p_S / 2.0 * np.log(tau) - 0.5 * np.log(det_S) + zSigmaz_S / 2.0 + + @staticmethod + def _ind_Normal_fixed_sigma_marginal_external( + index_vec_input: np.ndarray, + Sigma: np.ndarray, + z: np.ndarray, + tau: float, + p_S: int, + ) -> float: + """Marginal likelihood of a configuration model. + + Args: + index_vec_input (np.ndarray): The index vector. + Sigma (np.ndarray): The Sigma matrix. + z (np.ndarray): The z vector. + tau (float): The tau value. + p_S (int): The number of SNPs. + + Returns: + float: The marginal likelihood of a model. + + Examples: + >>> index_vec_input = np.array([1, 2]) + >>> Sigma = np.array([[1, 0.5, 0.2], [0.5, 1, 0.3], [0.2, 0.3, 1]]) + >>> z = np.array([10, 11, 10]) + >>> tau = 1 + >>> p_S = 2 + >>> np.round(CARMA._ind_Normal_fixed_sigma_marginal_external(index_vec_input, Sigma, z, tau, p_S),decimals=5) + 43.60579 + """ + index_vec = index_vec_input - 1 + Sigma_S = Sigma[np.ix_(index_vec, index_vec)] + A = tau * np.eye(p_S) + + det_S = det(Sigma_S + A) + Sigma_S_inv = inv(Sigma_S + A) + + sub_z = z[index_vec] + zSigmaz_S = np.dot(sub_z.T, np.dot(Sigma_S_inv, sub_z)) + + b = CARMA._ind_normal_sigma_fixed_marginal_fun_indi(zSigmaz_S, tau, p_S, det_S) + + results = b + + return results + + @staticmethod + def _outlier_ind_Normal_marginal_external( + index_vec_input: np.ndarray, + Sigma: np.ndarray, + z: np.ndarray, + tau: float, + p_S: int, + ) -> float: + """Likelihood of the outlier model.
+ + Args: + index_vec_input (np.ndarray): The index vector. + Sigma (np.ndarray): The Sigma matrix. + z (np.ndarray): The z vector. + tau (float): The tau value. + p_S (int): The number of SNPs. + + Returns: + float: The likelihood of a model. + + Examples: + >>> index_vec_input = np.array([1, 2, 3]) + >>> Sigma = np.array([[1, 0.5, 0.2], [0.5, 1, 0.3], [0.2, 0.3, 1]]) + >>> z = np.array([0.1, 0.2, 0.3]) + >>> tau = 1 / 0.05**2 + >>> p_S = 3 + >>> np.round(CARMA._outlier_ind_Normal_marginal_external(index_vec_input, Sigma, z, tau, p_S),decimals=5) + -8.8497 + """ + index_vec = index_vec_input - 1 + + Sigma_S = Sigma[np.ix_(index_vec, index_vec)] + A = tau * np.eye(p_S) + + Sigma_S_I_inv = pinv(Sigma_S + A, rtol=0.00001) + Sigma_S_inv = pinv(Sigma_S, rtol=0.00001) + + det_S = np.abs(det(Sigma_S_inv)) + det_I_S = np.abs(det(Sigma_S_I_inv)) + + sub_z = z[index_vec] + zSigmaz_S = np.dot(sub_z, np.dot(Sigma_S_inv, sub_z)) + zSigmaz_I_S = np.dot(sub_z, np.dot(Sigma_S_I_inv, sub_z)) + + b = 0.5 * (np.log(det_S) + np.log(det_I_S)) - 0.5 * (zSigmaz_S - zSigmaz_I_S) + results = b + + return results + + @staticmethod + def _add_function(S_sub: np.ndarray, y: Any) -> np.ndarray: + """Build sorted index arrays by appending each element of S_sub to y. + + Args: + S_sub (np.ndarray): The array of candidate indexes. + y (Any): The base index array. + + Returns: + np.ndarray: One sorted index array per element of S_sub. + + Examples: + >>> S_sub = np.array([3, 4]) + >>> y = np.array([1, 2]) + >>> CARMA._add_function(S_sub, y) + array([[1, 2, 3], + [1, 2, 4]]) + """ + return np.array([np.sort(np.concatenate(([x], y))) for x in S_sub]) + + @staticmethod + def _set_gamma_func_base(S: Any, p: int) -> dict[int, np.ndarray]: + """Creates a dictionary of sets of configurations assuming no conditional set. + + Args: + S (Any): The input set. + p (int): The number of SNPs. + + Returns: + dict[int, np.ndarray]: A dictionary of sets of configurations. + + Examples: + >>> S = [0,1] + >>> p = 4 + >>> CARMA._set_gamma_func_base(S, p) + {0: array([[0], + [1]]), 1: array([[0, 1, 2], + [0, 1, 3]]), 2: array([[0, 2], + [0, 3], + [1, 2], + [1, 3]])} + + >>> S = [0] + >>> p = 2 + >>> CARMA._set_gamma_func_base(S, p) + {0: None, 1: array([[0, 1]]), 2: array([[1]])} + + >>> S = [] + >>> p = 2 + >>> CARMA._set_gamma_func_base(S, p) + {0: None, 1: array([[0], + [1]]), 2: None} + """ + set_gamma: dict[int, Any] = {} + + if len(S) == 0: + set_gamma[0] = None + set_gamma[1] = np.arange(0, p).reshape(-1, 1) + set_gamma[2] = None + + if len(S) == 1: + S_sub = np.setdiff1d(np.arange(0, p), S) + set_gamma[0] = None + set_gamma[1] = CARMA._add_function(S_sub, S) + set_gamma[2] = S_sub.reshape(-1, 1) + + if len(S) > 1: + S_sub = np.setdiff1d(np.arange(0, p), S) + S = np.sort(S) + set_gamma[0] = np.array(list(combinations(S, len(S) - 1))) + set_gamma[1] = CARMA._add_function(S_sub, S) + xs = np.vstack([CARMA._add_function(S_sub, row) for row in set_gamma[0]]) + set_gamma[2] = xs + + return set_gamma + + @staticmethod + def _set_gamma_func_conditional( + input_S: Any, condition_index: list[int], p: int + ) -> dict[int, np.ndarray]: + """Creates a dictionary of sets of configurations assuming a conditional set. + + Args: + input_S (Any): The input set. + condition_index (list[int]): The conditional set. + p (int): The number of SNPs. + + Returns: + dict[int, np.ndarray]: A dictionary of sets of configurations.
+ + Examples: + >>> input_S = [0,1,2] + >>> condition_index = [2] + >>> p = 4 + >>> CARMA._set_gamma_func_conditional(input_S, condition_index, p) + {0: array([[0], + [1]]), 1: array([[0, 1, 3]]), 2: array([[0, 3], + [1, 3]])} + """ + set_gamma: dict[int, Any] = {} + S = np.setdiff1d(input_S, condition_index) + + # set of gamma- + if len(S) == 0: + S_sub = np.setdiff1d(np.arange(0, p), condition_index) + set_gamma[0] = None + set_gamma[1] = S_sub.reshape(-1, 1) + set_gamma[2] = None + + if len(S) == 1: + S_sub = np.setdiff1d(np.arange(0, p), input_S) + set_gamma[0] = None + set_gamma[1] = CARMA._add_function(S_sub, S) + set_gamma[2] = S_sub.reshape(-1, 1) + + if len(S) > 1: + S_sub = np.setdiff1d(np.arange(0, p), input_S) + S = np.sort(S) + set_gamma[0] = np.array(list(combinations(S, len(S) - 1))) + set_gamma[1] = CARMA._add_function(S_sub, S) + xs = np.vstack([CARMA._add_function(S_sub, row) for row in set_gamma[0]]) + set_gamma[2] = xs + + return set_gamma + + @staticmethod + def _set_gamma_func( + input_S: Any, p: int, condition_index: list[int] | None = None + ) -> dict[int, np.ndarray]: + """Creates a dictionary of sets of configurations. + + Args: + input_S (Any): The input set. + p (int): The number of SNPs. + condition_index (list[int] | None): The conditional set. Defaults to None. + + Returns: + dict[int, np.ndarray]: A dictionary of sets of configurations. + + Examples: + >>> input_S = [0,1,2] + >>> condition_index=[2] + >>> p = 4 + >>> CARMA._set_gamma_func(input_S, p, condition_index) + {0: array([[0], + [1]]), 1: array([[0, 1, 3]]), 2: array([[0, 3], + [1, 3]])} + """ + if condition_index is None: + results = CARMA._set_gamma_func_base(input_S, p) + else: + results = CARMA._set_gamma_func_conditional(input_S, condition_index, p) + return results + + @staticmethod + def _index_fun_internal(x: np.ndarray) -> str: + """Convert an array of causal SNP indexes to a comma-separated string. + + Args: + x (np.ndarray): The input array. + + Returns: + str: The comma-separated string. + + Examples: + >>> x = np.array([1,2,3]) + >>> CARMA._index_fun_internal(x) + '1,2,3' + """ + y = np.sort(x) + y = y.astype(str) + return ",".join(y) + + @staticmethod + def _index_fun(y: np.ndarray) -> np.ndarray: + """Convert each row of causal SNP indexes to a comma-separated string. + + Args: + y (np.ndarray): The input array. + + Returns: + np.ndarray: An array of comma-separated strings, one per row. + + Examples: + >>> y = np.array([[1,2,3],[4,5,6]]) + >>> CARMA._index_fun(y) + array(['1,2,3', '4,5,6'], dtype='<U5') + """ + return np.apply_along_axis(CARMA._index_fun_internal, 1, y) + + @staticmethod + def _ridge_fun( + x: float, + Sigma: np.ndarray, + modi_ld_S: np.ndarray, + test_S: np.ndarray, + z: np.ndarray, + outlier_tau: float, + outlier_likelihood: Any, + ) -> float: + """Objective function used to estimate the matrix shrinkage parameter for outlier detection. + + Args: + x (float): The candidate shrinkage parameter. + Sigma (np.ndarray): The Sigma matrix. + modi_ld_S (np.ndarray): The modi_ld_S matrix. + test_S (np.ndarray): The test_S matrix. + z (np.ndarray): The z vector. + outlier_tau (float): The outlier_tau value. + outlier_likelihood (Any): The outlier_likelihood function. + + Returns: + float: The negative outlier-model likelihood at the given shrinkage parameter.
+ + Examples: + >>> x = 0.5 + >>> Sigma = np.array([[1, 0.5, 0.2], [0.5, 1, 0.3], [0.2, 0.3, 1]]) + >>> modi_ld_S = np.array([[1, 0.5], [0.5, 1]]) + >>> test_S = np.array([1, 2]) + >>> z = np.array([0.1, 0.2, 0.3]) + >>> outlier_tau = 1 / 0.05**2 + >>> outlier_likelihood = CARMA._outlier_ind_Normal_marginal_external + >>> np.round(CARMA._ridge_fun(x, Sigma, modi_ld_S, test_S, z, outlier_tau, outlier_likelihood),decimals=5) + 6.01486 + """ + temp_Sigma = Sigma.copy() + temp_ld_S = x * modi_ld_S + (1 - x) * np.eye(len(modi_ld_S)) + temp_Sigma[np.ix_(test_S, test_S)] = temp_ld_S + return -outlier_likelihood( + index_vec_input=test_S + 1, + Sigma=temp_Sigma, + z=z, + tau=outlier_tau, + p_S=len(test_S), + ) + + @staticmethod + def _prior_dist(t: str, lambda_val: float, p: int) -> float: + """Compute the log-prior of a given configuration. + + Args: + t (str): The input string for the given configuration. + lambda_val (float): Hyperparameter of the Poisson model prior. + p (int): The number of SNPs. + + Returns: + float: The log-prior of the configuration. + + Examples: + >>> t = "1,2,3" + >>> lambda_val = 1 + >>> p = 4 + >>> np.round(CARMA._prior_dist(t, lambda_val, p),decimals=5) + -3.17805 + """ + index_array = t.split(",") + dim_model = len(index_array) + if t == "": + dim_model = 0 + return ( + dim_model * np.log(lambda_val) + lgamma(p - dim_model + 1) - lgamma(p + 1) + ) + + @staticmethod + def _PIP_func( + likeli: pd.DataFrame, model_space: pd.DataFrame, p: int, num_causal: int + ) -> np.ndarray: + """Estimates the posterior inclusion probabilities (PIPs) for all SNPs. + + Args: + likeli (pd.DataFrame): The marginal likelihoods. + model_space (pd.DataFrame): The corresponding model space. + p (int): The number of SNPs. + num_causal (int): The maximum number of causal SNPs. + + Returns: + np.ndarray: The posterior inclusion probabilities (PIPs) for all SNPs.
+ + Examples: + >>> likeli = pd.DataFrame([10, 10, 5,11,0], columns=['likeli']).squeeze() + >>> model_space = pd.DataFrame(['0', '1', '2','0,1',''], columns=['config']).squeeze() + >>> p = 3 + >>> num_causal = 2 + >>> CARMA._PIP_func(likeli, model_space, p, num_causal) + array([0.7869271, 0.7869271, 0.001426 ]) + """ + likeli = likeli.reset_index(drop=True) + model_space = model_space.reset_index(drop=True) + + model_space_matrix = np.zeros((len(model_space), p), dtype=int) + + for i in range(len(model_space)): + if model_space.iloc[i] != "": + ind = list(map(int, model_space.iloc[i].split(","))) + if len(ind) > 0: + model_space_matrix[i, ind] = 1 + + infi_index = np.where(np.isinf(likeli))[0] + if len(infi_index) != 0: + likeli = likeli.drop(infi_index).reset_index(drop=True) + model_space_matrix = np.delete(model_space_matrix, infi_index, axis=0) + + na_index = np.where(np.isnan(likeli))[0] + if len(na_index) != 0: + likeli = likeli.drop(na_index).reset_index(drop=True) + model_space_matrix = np.delete(model_space_matrix, na_index, axis=0) + + row_sums = np.sum(model_space_matrix, axis=1) + model_space_matrix = model_space_matrix[row_sums <= num_causal] + likeli = likeli[row_sums <= num_causal] + + aa = likeli - max(likeli) + prob_sum = np.sum(np.exp(aa)) + + result_prob = np.zeros(p) + for i in range(p): + result_prob[i] = ( + np.sum(np.exp(aa[model_space_matrix[:, i] == 1])) / prob_sum + ) + + return result_prob + + @staticmethod + def _MCS_modified( # noqa: C901 + z: np.ndarray, + ld_matrix: np.ndarray, + Max_Model_Dim: int = 10_000, + lambda_val: float = 1, + num_causal: int = 10, + outlier_switch: bool = True, + input_conditional_S_list: list[int] | None = None, + tau: float = 1 / 0.05**2, + epsilon: float = 1e-3, + inner_all_iter: int = 10, + outlier_BF_index: float | None = None, + ) -> dict[str, Any]: + """Modified Monte Carlo shotgun sampling (MCS) algorithm. + + Args: + z (np.ndarray): Numeric vector representing z-scores. + ld_matrix (np.ndarray): Numeric matrix representing the linkage disequilibrium (LD) matrix. + Max_Model_Dim (int): Maximum number of candidate models retained in the model space. + lambda_val (float): Hyperparameter of the Poisson prior on the number of causal variants. + num_causal (int): Maximum number of causal variants to be selected in the final model. + outlier_switch (bool): Whether to consider outlier detection in the analysis. + input_conditional_S_list (list[int] | None): The conditional set. Defaults to None. + tau (float): Precision hyperparameter of the Spike-and-Slab effect size prior. + epsilon (float): Threshold for convergence in CARMA iterations. + inner_all_iter (int): The number of inner iterations in each CARMA iteration. + outlier_BF_index (float | None): Bayes Factor threshold for identifying outliers. Defaults to None. + + Returns: + dict[str, Any]: A dictionary containing the following results: + - B_list: A dataframe containing the marginal likelihoods and the corresponding model space. + - conditional_S_list: A list of outliers.
+ + Examples: + >>> z = np.array([0.1, 0.2, 0.3]) + >>> ld_matrix = np.array([[1, 0.5, 0.2], [0.5, 1, 0.3], [0.2, 0.3, 1]]) + >>> Max_Model_Dim = 10_000 + >>> lambda_val = 1 + + >>> num_causal = 10 + >>> outlier_switch = True + """ + p = len(z) + marginal_likelihood = CARMA._ind_Normal_fixed_sigma_marginal_external + tau_sample = tau + if outlier_switch: + outlier_likelihood = CARMA._outlier_ind_Normal_marginal_external + outlier_tau = tau + + B = Max_Model_Dim + stored_bf = 0 + Sigma = ld_matrix + + S = [] + + null_model = "" + null_margin = CARMA._prior_dist(null_model, lambda_val=lambda_val, p=p) + + B_list = pd.DataFrame({"set_gamma_margin": [null_margin], "matrix_gamma": [""]}) + + if input_conditional_S_list is None: + conditional_S = [] + else: + conditional_S = input_conditional_S_list + S = conditional_S + + for _i in range(0, inner_all_iter): + for _j in range(0, 10): + set_gamma = CARMA._set_gamma_func( + input_S=S, p=p, condition_index=conditional_S + ) + + if conditional_S is None: + working_S = S + else: + working_S = np.sort(np.setdiff1d(S, conditional_S)).astype(int) + + set_gamma_margin: list[Any] = [None, None, None] + set_gamma_prior: list[Any] = [None, None, None] + matrix_gamma: list[Any] = [None, None, None] + + for i in range(0, len(set_gamma)): + if set_gamma[i] is not None: + matrix_gamma[i] = CARMA._index_fun(set_gamma[i]) + p_S = set_gamma[i].shape[1] + set_gamma_margin[i] = np.apply_along_axis( + marginal_likelihood, + 1, + set_gamma[i] + 1, + Sigma=Sigma, + z=z, + tau=tau_sample, + p_S=p_S, + ) + set_gamma_prior[i] = np.array( + [ + CARMA._prior_dist(model, lambda_val=lambda_val, p=p) + for model in matrix_gamma[i] + ] + ) + set_gamma_margin[i] = set_gamma_prior[i] + set_gamma_margin[i] + else: + set_gamma_margin[i] = np.array(null_margin) + set_gamma_prior[i] = 0 + matrix_gamma[i] = np.array(null_model) + + columns = ["set_gamma_margin", "matrix_gamma"] + add_B = pd.DataFrame(columns=columns) + + for i in range(len(set_gamma)): + if isinstance(set_gamma_margin[i].tolist(), list): + new_row = pd.DataFrame( + { + "set_gamma_margin": set_gamma_margin[i].tolist(), + "matrix_gamma": matrix_gamma[i].tolist(), + } + ) + add_B = pd.concat([add_B, new_row], ignore_index=True) + else: + new_row = pd.DataFrame( + { + "set_gamma_margin": [set_gamma_margin[i].tolist()], + "matrix_gamma": [matrix_gamma[i].tolist()], + } + ) + add_B = pd.concat([add_B, new_row], ignore_index=True) + + # Add visited models into the storage space of models + B_list = pd.concat([B_list, add_B], ignore_index=True) + B_list = B_list.drop_duplicates( + subset="matrix_gamma", ignore_index=True + ) + B_list = B_list.sort_values( + by="set_gamma_margin", ignore_index=True, ascending=False + ) + + if len(working_S) == 0: + # Create a DataFrame set.star + set_star = pd.DataFrame( + { + "set_index": [0, 1, 2], + "gamma_set_index": [np.nan, np.nan, np.nan], + "margin": [np.nan, np.nan, np.nan], + } + ) + + # Assuming set.gamma.margin and current.log.margin are defined + aa = set_gamma_margin[1] + aa = aa - aa[np.argmax(aa)] + + min_half_len = min(len(aa), floor(p / 2)) + decr_ind = np.argsort(np.exp(aa))[::-1] + decr_half_ind = decr_ind[:min_half_len] + + probs = np.exp(aa)[decr_half_ind] + + chosen_index = np.random.choice( + decr_half_ind, 1, p=probs / np.sum(probs) + ) + set_star.at[1, "gamma_set_index"] = chosen_index[0] + set_star.at[1, "margin"] = set_gamma_margin[1][chosen_index[0]] + + S = set_gamma[1][chosen_index[0]].tolist() + + else: + set_star = pd.DataFrame( + { + "set_index": [0, 1, 2], + 
"gamma_set_index": [np.nan, np.nan, np.nan], + "margin": [np.nan, np.nan, np.nan], + } + ) + for i in range(0, 3): + aa = set_gamma_margin[i] + if np.size(aa) > 1: + aa = aa - aa[np.argmax(aa)] + chosen_index = np.random.choice( + range(0, np.size(set_gamma_margin[i])), + 1, + p=np.exp(aa) / np.sum(np.exp(aa)), + ) + set_star.at[i, "gamma_set_index"] = chosen_index + set_star.at[i, "margin"] = set_gamma_margin[i][chosen_index] + else: + set_star.at[i, "gamma_set_index"] = 0 + set_star.at[i, "margin"] = set_gamma_margin[i] + + if outlier_switch: + for i in range(1, len(set_gamma)): + test_log_BF: float = 100 + while True: + aa = set_gamma_margin[i] + aa = aa - aa[np.argmax(aa)] + chosen_index = np.random.choice( + range(0, np.size(set_gamma_margin[i])), + 1, + p=np.exp(aa) / np.sum(np.exp(aa)), + ) + set_star.at[i, "gamma_set_index"] = chosen_index + set_star.at[i, "margin"] = set_gamma_margin[i][ + chosen_index + ] + + test_S = set_gamma[i][int(chosen_index), :] + + modi_Sigma = Sigma.copy() + if np.size(test_S) > 1: + modi_ld_S = modi_Sigma[test_S][:, test_S] + + result = minimize_scalar( + CARMA._ridge_fun, + bounds=(0, 1), + args=( + Sigma, + modi_ld_S, + test_S, + z, + outlier_tau, + outlier_likelihood, + ), + method="bounded", + ) + modi_ld_S = result.x * modi_ld_S + ( + 1 - result.x + ) * np.eye(len(modi_ld_S)) + + modi_Sigma[np.ix_(test_S, test_S)] = modi_ld_S + + test_log_BF = outlier_likelihood( + test_S + 1, Sigma, z, outlier_tau, len(test_S) + ) - outlier_likelihood( + test_S + 1, + modi_Sigma, + z, + outlier_tau, + len(test_S), + ) + test_log_BF = -np.abs(test_log_BF) + + if np.exp(test_log_BF) < outlier_BF_index: + set_gamma[i] = np.delete( + set_gamma[i], + int(set_star["gamma_set_index"][i]), + axis=0, + ) + set_gamma_margin[i] = np.delete( + set_gamma_margin[i], + int(set_star["gamma_set_index"][i]), + axis=0, + ) + conditional_S = np.concatenate( + [conditional_S, np.setdiff1d(test_S, working_S)] + ) + conditional_S = ( + np.unique(conditional_S).astype(int).tolist() + ) + else: + break + + if len(working_S) == num_causal: + set_star = set_star.drop(1) + aa = set_star["margin"] - max(set_star["margin"]) + sec_sample = np.random.choice( + [0, 2], 1, p=np.exp(aa) / np.sum(np.exp(aa)) + ) + ind_sec = int( + set_star["gamma_set_index"][ + set_star["set_index"] == int(sec_sample) + ] + ) + S = set_gamma[sec_sample[0]][ind_sec].tolist() + else: + aa = set_star["margin"] - max(set_star["margin"]) + sec_sample = np.random.choice( + range(0, 3), 1, p=np.exp(aa) / np.sum(np.exp(aa)) + ) + S = set_gamma[sec_sample[0]][ + int(set_star["gamma_set_index"][sec_sample[0]]) + ].tolist() + + for item in conditional_S: + if item not in S: + S.append(item) + # END h_ind loop + # + if conditional_S is not None: + all_c_index = [] + index_array = [s.split(",") for s in B_list["matrix_gamma"]] + for tt in conditional_S: + tt_str = str(tt) + ind = [ + i for i, sublist in enumerate(index_array) if tt_str in sublist + ] + all_c_index.extend(ind) + + all_c_index = list(set(all_c_index)) + + if len(all_c_index) > 0: + temp_B_list = B_list.copy() + temp_B_list = B_list.drop(all_c_index) + else: + temp_B_list = B_list.copy() + else: + temp_B_list = B_list.copy() + + result_B_list = temp_B_list[: min(int(B), len(temp_B_list))] + + rb1 = result_B_list["set_gamma_margin"] + + difference = abs(rb1[: (len(rb1) // 4)].mean() - stored_bf) + + if difference < epsilon: + break + else: + stored_bf = rb1[: (len(rb1) // 4)].mean() + + out = {"B_list": result_B_list, "conditional_S_list": conditional_S} + + return 
out diff --git a/tests/conftest.py b/tests/conftest.py index 09991b4fd..29f90d7e5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,8 @@ import dbldatagen as dg import hail as hl +import numpy as np +import pandas as pd import pytest from otg.common.Liftover import LiftOverSpark from otg.common.session import Session @@ -609,3 +611,15 @@ def mock_l2g_predictions(spark: SparkSession) -> L2GPrediction: ).withSchema(schema) return L2GPrediction(_df=data_spec.build(), _schema=schema) + + +@pytest.fixture() +def sample_data_for_carma() -> list[np.ndarray]: + """Sample data for fine-mapping by CARMA.""" + ld = pd.read_csv("tests/data_samples/01_test_ld.csv", header=None) + ld = np.array(ld) + z = pd.read_csv("tests/data_samples/01_test_z.csv") + z = np.array(z.iloc[:, 1]) + pips = pd.read_csv("tests/data_samples/01_test_PIPs.txt") + pips = np.array(pips.iloc[:, 0]) + return [ld, z, pips] diff --git a/tests/data_samples/01_test_PIPs.txt b/tests/data_samples/01_test_PIPs.txt new file mode 100644 index 000000000..97369e389 --- /dev/null +++ b/tests/data_samples/01_test_PIPs.txt @@ -0,0 +1,22 @@ +PIP +0.88970960804825 +0.32918411642274 +0.0710009195947341 +0.481104314037781 +0.366748864559988 +0 +0.0332182074643238 +0.0426456172197931 +0.0401158455216852 +0.0460954571036408 +0.0275041169833703 +0.0376699507565114 +0.0348960097376561 +0.0356747489619599 +0.032741594571998 +0.0260361597088635 +0.0144548554437742 +0.0298980357084198 +0.0173927444347685 +0.0190199141231909 +0.0147391490166984 diff --git a/tests/data_samples/01_test_ld.csv b/tests/data_samples/01_test_ld.csv new file mode 100644 index 000000000..98ef7d7dc --- /dev/null +++ b/tests/data_samples/01_test_ld.csv @@ -0,0 +1,21 @@ +1,0.771180152061322,0.915096145535259,0.578676135360495,0.609181312173864,0.529277398860646,0.538305421923955,0.547685100857922,0.557750298960975,0.527989730181989,0.617832510579344,0.665556180143122,0.667991840837611,0.60264843482699,0.669108775854588,0.7064474740211,0.484413985057867,0.658231100907897,0.605253879999999,0.497201333151297,0.446028260596218 +0.771180152061322,1,0.77727075611386,0.552682624174407,0.58181754117183,0.505502825922267,0.514125319875,0.543913702114378,0.532696753896335,0.504272997939221,0.614165040578185,0.696788603725357,0.621163825411455,0.622686787787889,0.681772343971706,0.649743678314224,0.462654628537221,0.598943479013203,0.61334465995712,0.474867582672794,0.425993149637484 +0.915096145535259,0.77727075611386,1,0.577479008507665,0.607921078232143,0.528182464841331,0.537191811328613,0.550862221334237,0.556596462092198,0.526897460006331,0.624901653466727,0.680315414951671,0.666609943570256,0.609591091014596,0.675814335864293,0.704986022287415,0.483411861496926,0.652683936089198,0.612249138732174,0.496172755971753,0.445105546879074 +0.578676135360495,0.552682624174407,0.577479008507665,1,0.936516764898861,0.886419522629392,0.883539360549454,0.815054425259397,0.89015857597522,0.898287478639959,0.838248655204035,0.744557055988033,0.783367967202267,0.751988040768699,0.77508316651858,0.784583660547929,0.807725094487557,0.742426044370671,0.755754059769589,0.859204834568748,0.770773552495573 +0.609181312173864,0.58181754117183,0.607921078232143,0.936516764898861,1,0.854958496542078,0.851504297587258,0.791029211826282,0.856694225681807,0.866720169563076,0.834733885343502,0.760017201376208,0.774596562194539,0.752527570939104,0.788093184636649,0.794350666108285,0.75665727582634,0.74333276725865,0.756968561671187,0.816179556423083,0.732176532146998 
+0.529277398860646,0.505502825922267,0.528182464841331,0.886419522629392,0.854958496542078,1,0.963898264597837,0.853713516710945,0.834573712868289,0.943978481039337,0.813959652160027,0.696149797263453,0.711039687400227,0.727859955813965,0.723631001344102,0.717921993042117,0.803742177386196,0.694948637689016,0.730383444841099,0.854317112108248,0.821546555224246 +0.538305421923955,0.514125319875,0.537191811328613,0.883539360549454,0.851504297587258,0.963898264597837,1,0.837222699518761,0.827898435017643,0.932460550448256,0.823537842612335,0.708130798640251,0.716153237106053,0.737417065645648,0.736188455310841,0.726481884590385,0.80930234445874,0.695825669546909,0.731184013149923,0.844127087659027,0.807561321528189 +0.547685100857922,0.543913702114378,0.550862221334237,0.815054425259397,0.791029211826282,0.853713516710945,0.837222699518761,1,0.775610538279335,0.822970043739792,0.859046438973893,0.753815555147918,0.68137087812584,0.807807766988614,0.788202904557297,0.701896810520296,0.76195748416893,0.670659290732083,0.820365957141738,0.790381010655166,0.752349050415205 +0.557750298960975,0.532696753896335,0.556596462092198,0.89015857597522,0.856694225681807,0.834573712868289,0.827898435017643,0.775610538279335,1,0.85122956121199,0.78256827181717,0.694872100391695,0.790735467600445,0.701238262514285,0.732687372087784,0.776383332122737,0.853625772873608,0.75338566937418,0.717680388453207,0.881639076169229,0.799691656691387 +0.527989730181989,0.504272997939221,0.526897460006331,0.898287478639959,0.866720169563076,0.943978481039337,0.932460550448256,0.822970043739792,0.85122956121199,1,0.793701799680761,0.690004496555049,0.731635213823588,0.707752264986572,0.721839633198947,0.733964201264005,0.810932477460108,0.706486482434112,0.728339418415153,0.881569561680494,0.8235799531949 +0.617832510579344,0.614165040578185,0.624901653466727,0.838248655204035,0.834733885343502,0.813959652160027,0.823537842612335,0.859046438973893,0.78256827181717,0.793701799680761,1,0.838728935703877,0.730058107563001,0.801622647280446,0.885835296731882,0.751225235287299,0.720090008814676,0.712330022715523,0.806941386442907,0.749762076154044,0.688659095652229 +0.665556180143122,0.696788603725357,0.680315414951671,0.744557055988033,0.760017201376208,0.696149797263453,0.708130798640251,0.753815555147918,0.694872100391695,0.690004496555049,0.838728935703877,1,0.690877299131653,0.835204883519753,0.907693233980355,0.727512311958015,0.622737806640536,0.662524522572879,0.83868713968573,0.644451657638191,0.585569862432858 +0.667991840837611,0.621163825411455,0.666609943570256,0.783367967202267,0.774596562194539,0.711039687400227,0.716153237106053,0.68137087812584,0.790735467600445,0.731635213823588,0.730058107563001,0.690877299131653,1,0.676934005738399,0.734343307812905,0.899933405504758,0.715772451454926,0.882444836505259,0.686555132047416,0.716453568744335,0.657893481460993 +0.60264843482699,0.622686787787889,0.609591091014596,0.751988040768699,0.752527570939104,0.727859955813965,0.737417065645648,0.807807766988614,0.701238262514285,0.707752264986572,0.801622647280446,0.835204883519753,0.676934005738399,1,0.817201445327388,0.696633769335204,0.657611779864444,0.642714199636698,0.947439500661142,0.677716616058044,0.631200367769011 
+0.669108775854588,0.681772343971706,0.675814335864293,0.77508316651858,0.788093184636649,0.723631001344102,0.736188455310841,0.788202904557297,0.732687372087784,0.721839633198947,0.885835296731882,0.907693233980355,0.734343307812905,0.817201445327388,1,0.765000546116046,0.656530931255051,0.701409988164336,0.828291978937351,0.678983858053719,0.612477024935031 +0.7064474740211,0.649743678314224,0.704986022287415,0.784583660547929,0.794350666108285,0.717921993042117,0.726481884590385,0.701896810520296,0.776383332122737,0.733964201264005,0.751225235287299,0.727512311958015,0.899933405504758,0.696633769335204,0.765000546116046,1,0.685704178826746,0.833758315351893,0.711196796654639,0.699208668970601,0.631367903486758 +0.484413985057867,0.462654628537221,0.483411861496926,0.807725094487557,0.75665727582634,0.803742177386196,0.80930234445874,0.76195748416893,0.853625772873608,0.810932477460108,0.720090008814676,0.622737806640536,0.715772451454926,0.657611779864444,0.656530931255051,0.685704178826746,1,0.698311095599307,0.668195726288336,0.885683784414127,0.90422815420025 +0.658231100907897,0.598943479013203,0.652683936089198,0.742426044370671,0.74333276725865,0.694948637689016,0.695825669546909,0.670659290732083,0.75338566937418,0.706486482434112,0.712330022715523,0.662524522572879,0.882444836505259,0.642714199636698,0.701409988164336,0.833758315351893,0.698311095599307,1,0.65211701190785,0.694760576183239,0.658587559031874 +0.605253879999999,0.61334465995712,0.612249138732174,0.755754059769589,0.756968561671187,0.730383444841099,0.731184013149923,0.820365957141738,0.717680388453207,0.728339418415153,0.806941386442907,0.83868713968573,0.686555132047416,0.947439500661142,0.828291978937351,0.711196796654639,0.668195726288336,0.65211701190785,1,0.697918036358423,0.641031039844369 +0.497201333151297,0.474867582672794,0.496172755971753,0.859204834568748,0.816179556423083,0.854317112108248,0.844127087659027,0.790381010655166,0.881639076169229,0.881569561680494,0.749762076154044,0.644451657638191,0.716453568744335,0.677716616058044,0.678983858053719,0.699208668970601,0.885683784414127,0.694760576183239,0.697918036358423,1,0.897077764794515 +0.446028260596218,0.425993149637484,0.445105546879074,0.770773552495573,0.732176532146998,0.821546555224246,0.807561321528189,0.752349050415205,0.799691656691387,0.8235799531949,0.688659095652229,0.585569862432858,0.657893481460993,0.631200367769011,0.612477024935031,0.631367903486758,0.90422815420025,0.658587559031874,0.641031039844369,0.897077764794515,1 diff --git a/tests/data_samples/01_test_z.csv b/tests/data_samples/01_test_z.csv new file mode 100644 index 000000000..fbbc8ae69 --- /dev/null +++ b/tests/data_samples/01_test_z.csv @@ -0,0 +1,22 @@ +snp_names,z +snp1,11.1804889503043 +snp2,10.7091564538033 +snp3,10.8047694726093 +snp4,10.2906723065847 +snp5,10.4679212690275 +snp6,-9.45046087299768 +snp7,9.50300649465759 +snp8,9.5237815039053 +snp9,9.54169575943337 +snp10,9.34599264627289 +snp11,9.96019948433531 +snp12,10.2860468030192 +snp13,10.2692195034972 +snp14,9.72102110742257 +snp15,10.2228843197938 +snp16,10.4338783929356 +snp17,8.47548514034471 +snp18,9.92845677961064 +snp19,9.53893003873459 +snp20,8.6241266487016 +snp21,8.11595856074091 diff --git a/tests/method/test_carma.py b/tests/method/test_carma.py new file mode 100644 index 000000000..46b93eee5 --- /dev/null +++ b/tests/method/test_carma.py @@ -0,0 +1,56 @@ +"""Test of main CARMA functions.""" + +from __future__ import annotations + +import numpy as np +from otg.method.carma import CARMA + + 
+class TestCARMA: + """Test of CARMA main functions.""" + + def test_CARMA_spike_slab_noEM_pips( + self: TestCARMA, sample_data_for_carma: list[np.ndarray] + ) -> None: + """Test of CARMA PIPs.""" + ld = sample_data_for_carma[0] + z = sample_data_for_carma[1] + pips = sample_data_for_carma[2] + _l = CARMA.CARMA_spike_slab_noEM(z=z, ld=ld) + assert np.allclose(np.round(np.corrcoef(_l["PIPs"], pips)[0, 1], decimals=2), 1) + + def test_CARMA_spike_slab_noEM_outliers( + self: TestCARMA, sample_data_for_carma: list[np.ndarray] + ) -> None: + """Test of CARMA outlier detection.""" + ld = sample_data_for_carma[0] + z = sample_data_for_carma[1] + + _l = CARMA.CARMA_spike_slab_noEM(z=z, ld=ld) + assert np.allclose(_l["Outliers"], 5) + + def test_MCS_modified( + self: TestCARMA, sample_data_for_carma: list[np.ndarray] + ) -> None: + """Test of MCS_modified and PIP_func functions on PIP estimation.""" + ld = sample_data_for_carma[0] + z = sample_data_for_carma[1] + pips = sample_data_for_carma[2] + + l1 = CARMA._MCS_modified( + z=z, + ld_matrix=ld, + outlier_BF_index=1 / 3.2, + input_conditional_S_list=None, + lambda_val=1, + epsilon=1e-5 * 21, + outlier_switch=True, + tau=0.04, + ) + l1_pips = CARMA._PIP_func( + likeli=l1["B_list"]["set_gamma_margin"], + model_space=l1["B_list"]["matrix_gamma"], + p=21, + num_causal=10, + ) + assert np.allclose(np.round(np.corrcoef(l1_pips, pips)[0, 1], decimals=2), 1) From ac1064f85051a23056dc7735320c8561cf5ace74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Fri, 5 Jan 2024 13:27:09 +0100 Subject: [PATCH 21/21] chore: change picsed finngen output path (#385) to align it with `gwas_catalog_preprocess.py` --- src/airflow/dags/finngen_preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/airflow/dags/finngen_preprocess.py b/src/airflow/dags/finngen_preprocess.py index 3ca5f3907..a9f405daf 100644 --- a/src/airflow/dags/finngen_preprocess.py +++ b/src/airflow/dags/finngen_preprocess.py @@ -16,7 +16,7 @@ f"{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_window_clumped/finngen" ) LD_CLUMPED = f"{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_ld_clumped/finngen" -PICSED = f"{RELEASEBUCKET}/credible_set/from_sumstats_study_locus/finngen" +PICSED = f"{RELEASEBUCKET}/credible_set/from_sumstats/finngen" with DAG( dag_id=Path(__file__).stem,